r600g: Implement spilling of temp arrays (v2)
[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include "r600_sq.h"
24 #include "r600_formats.h"
25 #include "r600_opcodes.h"
26 #include "r600_shader.h"
27 #include "r600d.h"
28
29 #include "sb/sb_public.h"
30
31 #include "pipe/p_shader_tokens.h"
32 #include "tgsi/tgsi_info.h"
33 #include "tgsi/tgsi_parse.h"
34 #include "tgsi/tgsi_scan.h"
35 #include "tgsi/tgsi_dump.h"
36 #include "util/u_bitcast.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include <stdio.h>
40 #include <errno.h>
41
42 /* CAYMAN notes
43 Why CAYMAN got loops for lots of instructions is explained here.
44
45 -These 8xx t-slot only ops are implemented in all vector slots.
46 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47 These 8xx t-slot only opcodes become vector ops, with all four
48 slots expecting the arguments on sources a and b. Result is
49 broadcast to all channels.
50 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51 These 8xx t-slot only opcodes become vector ops in the z, y, and
52 x slots.
53 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55 SQRT_IEEE/_64
56 SIN/COS
57 The w slot may have an independent co-issued operation, or if the
58 result is required to be in the w slot, the opcode above may be
59 issued in the w slot as well.
60 The compiler must issue the source argument to slots z, y, and x
61 */
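/* A practical consequence for this file: helpers that would emit a single
 * t-slot instruction on R600/R700 instead loop over the vector slots on
 * CAYMAN -- see the MULLO_INT handling in single_alu_op2() below for the
 * pattern (one copy per slot, dst.write enabled only on the wanted channel).
 */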
62
63 /* Contents of r0 on entry to various shaders
64
65 VS - .x = VertexID
66 .y = RelVertexID (??)
67 .w = InstanceID
68
69 GS - r0.xyw, r1.xyz = per-vertex offsets
70 r0.z = PrimitiveID
71
72 TCS - .x = PatchID
73 .y = RelPatchID (??)
74 .z = InvocationID
75 .w = tess factor base.
76
77 TES - .x = TessCoord.x
78 - .y = TessCoord.y
79 - .z = RelPatchID (??)
80 - .w = PrimitiveID
81
82 PS - face_gpr.z = SampleMask
83 fixed_pt_position_gpr.w = SampleID
84 */
85 #define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
86 static int r600_shader_from_tgsi(struct r600_context *rctx,
87 struct r600_pipe_shader *pipeshader,
88 union r600_shader_key key);
89
90 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
91 int size, unsigned comp_mask) {
92
93 if (!size)
94 return;
95
96 if (ps->num_arrays == ps->max_arrays) {
97 ps->max_arrays += 64;
98 ps->arrays = realloc(ps->arrays, ps->max_arrays *
99 sizeof(struct r600_shader_array));
100 }
101
102 int n = ps->num_arrays;
103 ++ps->num_arrays;
104
105 ps->arrays[n].comp_mask = comp_mask;
106 ps->arrays[n].gpr_start = start_gpr;
107 ps->arrays[n].gpr_count = size;
108 }
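/* Each entry just records a live GPR range so later passes can treat it as
 * indirectly addressable; e.g. a non-spilled "DCL TEMP[4..7], ARRAY(1)"
 * becomes one r600_shader_array with gpr_count = 4 and comp_mask = 0xF
 * (see tgsi_declaration() below).
 */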
109
110 static void r600_dump_streamout(struct pipe_stream_output_info *so)
111 {
112 unsigned i;
113
114 fprintf(stderr, "STREAMOUT\n");
115 for (i = 0; i < so->num_outputs; i++) {
116 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
117 so->output[i].start_component;
118 fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
119 i,
120 so->output[i].stream,
121 so->output[i].output_buffer,
122 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
123 so->output[i].register_index,
124 mask & 1 ? "x" : "",
125 mask & 2 ? "y" : "",
126 mask & 4 ? "z" : "",
127 mask & 8 ? "w" : "",
128 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
129 }
130 }
131
132 static int store_shader(struct pipe_context *ctx,
133 struct r600_pipe_shader *shader)
134 {
135 struct r600_context *rctx = (struct r600_context *)ctx;
136 uint32_t *ptr, i;
137
138 if (shader->bo == NULL) {
139 shader->bo = (struct r600_resource*)
140 pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
141 if (shader->bo == NULL) {
142 return -ENOMEM;
143 }
144 ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
145 if (R600_BIG_ENDIAN) {
146 for (i = 0; i < shader->shader.bc.ndw; ++i) {
147 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
148 }
149 } else {
150 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
151 }
152 rctx->b.ws->buffer_unmap(shader->bo->buf);
153 }
154
155 return 0;
156 }
157
158 int r600_pipe_shader_create(struct pipe_context *ctx,
159 struct r600_pipe_shader *shader,
160 union r600_shader_key key)
161 {
162 struct r600_context *rctx = (struct r600_context *)ctx;
163 struct r600_pipe_shader_selector *sel = shader->selector;
164 int r;
165 bool dump = r600_can_dump_shader(&rctx->screen->b,
166 tgsi_get_processor_type(sel->tokens));
167 unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
168 unsigned sb_disasm;
169 unsigned export_shader;
170
171 shader->shader.bc.isa = rctx->isa;
172
173 if (dump) {
174 fprintf(stderr, "--------------------------------------------------------------\n");
175 tgsi_dump(sel->tokens, 0);
176
177 if (sel->so.num_outputs) {
178 r600_dump_streamout(&sel->so);
179 }
180 }
181 r = r600_shader_from_tgsi(rctx, shader, key);
182 if (r) {
183 R600_ERR("translation from TGSI failed !\n");
184 goto error;
185 }
186 if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
187 /* only disable for vertex shaders in tess paths */
188 if (key.vs.as_ls)
189 use_sb = 0;
190 }
191 use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
192 use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
193 use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);
194
195 /* disable SB for shaders using doubles */
196 use_sb &= !shader->shader.uses_doubles;
197
198 use_sb &= !shader->shader.uses_atomics;
199 use_sb &= !shader->shader.uses_images;
200 use_sb &= !shader->shader.uses_helper_invocation;
201
202 /* Check if the bytecode has already been built. */
203 if (!shader->shader.bc.bytecode) {
204 r = r600_bytecode_build(&shader->shader.bc);
205 if (r) {
206 R600_ERR("building bytecode failed !\n");
207 goto error;
208 }
209 }
210
211 sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
212 if (dump && !sb_disasm) {
213 fprintf(stderr, "--------------------------------------------------------------\n");
214 r600_bytecode_disasm(&shader->shader.bc);
215 fprintf(stderr, "______________________________________________________________\n");
216 } else if ((dump && sb_disasm) || use_sb) {
217 r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
218 dump, use_sb);
219 if (r) {
220 R600_ERR("r600_sb_bytecode_process failed !\n");
221 goto error;
222 }
223 }
224
225 if (shader->gs_copy_shader) {
226 if (dump) {
227 // dump copy shader
228 r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
229 &shader->gs_copy_shader->shader, dump, 0);
230 if (r)
231 goto error;
232 }
233
234 if ((r = store_shader(ctx, shader->gs_copy_shader)))
235 goto error;
236 }
237
238 /* Store the shader in a buffer. */
239 if ((r = store_shader(ctx, shader)))
240 goto error;
241
242 /* Build state. */
243 switch (shader->shader.processor_type) {
244 case PIPE_SHADER_TESS_CTRL:
245 evergreen_update_hs_state(ctx, shader);
246 break;
247 case PIPE_SHADER_TESS_EVAL:
248 if (key.tes.as_es)
249 evergreen_update_es_state(ctx, shader);
250 else
251 evergreen_update_vs_state(ctx, shader);
252 break;
253 case PIPE_SHADER_GEOMETRY:
254 if (rctx->b.chip_class >= EVERGREEN) {
255 evergreen_update_gs_state(ctx, shader);
256 evergreen_update_vs_state(ctx, shader->gs_copy_shader);
257 } else {
258 r600_update_gs_state(ctx, shader);
259 r600_update_vs_state(ctx, shader->gs_copy_shader);
260 }
261 break;
262 case PIPE_SHADER_VERTEX:
263 export_shader = key.vs.as_es;
264 if (rctx->b.chip_class >= EVERGREEN) {
265 if (key.vs.as_ls)
266 evergreen_update_ls_state(ctx, shader);
267 else if (key.vs.as_es)
268 evergreen_update_es_state(ctx, shader);
269 else
270 evergreen_update_vs_state(ctx, shader);
271 } else {
272 if (export_shader)
273 r600_update_es_state(ctx, shader);
274 else
275 r600_update_vs_state(ctx, shader);
276 }
277 break;
278 case PIPE_SHADER_FRAGMENT:
279 if (rctx->b.chip_class >= EVERGREEN) {
280 evergreen_update_ps_state(ctx, shader);
281 } else {
282 r600_update_ps_state(ctx, shader);
283 }
284 break;
285 case PIPE_SHADER_COMPUTE:
286 evergreen_update_ls_state(ctx, shader);
287 break;
288 default:
289 r = -EINVAL;
290 goto error;
291 }
292 return 0;
293
294 error:
295 r600_pipe_shader_destroy(ctx, shader);
296 return r;
297 }
298
299 void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
300 {
301 r600_resource_reference(&shader->bo, NULL);
302 r600_bytecode_clear(&shader->shader.bc);
303 r600_release_command_buffer(&shader->command_buffer);
304 }
305
306 /*
307 * tgsi -> r600 shader
308 */
309 struct r600_shader_tgsi_instruction;
310
311 struct r600_shader_src {
312 unsigned sel;
313 unsigned swizzle[4];
314 unsigned neg;
315 unsigned abs;
316 unsigned rel;
317 unsigned kc_bank;
318 boolean kc_rel; /* true if cache bank is indexed */
319 uint32_t value[4];
320 };
321
322 struct eg_interp {
323 boolean enabled;
324 unsigned ij_index;
325 };
326
327 struct r600_shader_ctx {
328 struct tgsi_shader_info info;
329 struct tgsi_array_info *array_infos;
330 /* flag for each tgsi temp array if its been spilled or not */
331 bool *spilled_arrays;
332 struct tgsi_parse_context parse;
333 const struct tgsi_token *tokens;
334 unsigned type;
335 unsigned file_offset[TGSI_FILE_COUNT];
336 unsigned temp_reg;
337 const struct r600_shader_tgsi_instruction *inst_info;
338 struct r600_bytecode *bc;
339 struct r600_shader *shader;
340 struct r600_shader_src src[4];
341 uint32_t *literals;
342 uint32_t nliterals;
343 uint32_t max_driver_temp_used;
344 /* needed for evergreen interpolation */
345 struct eg_interp eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
346 /* evergreen/cayman also store sample mask in face register */
347 int face_gpr;
348 /* sample id is .w component stored in fixed point position register */
349 int fixed_pt_position_gpr;
350 int colors_used;
351 boolean clip_vertex_write;
352 unsigned cv_output;
353 unsigned edgeflag_output;
354 int helper_invoc_reg;
355 int cs_block_size_reg;
356 int cs_grid_size_reg;
357 bool cs_block_size_loaded, cs_grid_size_loaded;
358 int fragcoord_input;
359 int next_ring_offset;
360 int gs_out_ring_offset;
361 int gs_next_vertex;
362 struct r600_shader *gs_for_vs;
363 int gs_export_gpr_tregs[4];
364 int gs_rotated_input[2];
365 const struct pipe_stream_output_info *gs_stream_output_info;
366 unsigned enabled_stream_buffers_mask;
367 unsigned tess_input_info; /* temp with tess input offsets */
368 unsigned tess_output_info; /* temp with tess output offsets */
369 unsigned thread_id_gpr; /* temp with thread id calculated for images */
370 bool thread_id_gpr_loaded;
371 };
372
373 struct r600_shader_tgsi_instruction {
374 unsigned op;
375 int (*process)(struct r600_shader_ctx *ctx);
376 };
377
378 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
379 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
380 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
381 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
382 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
383 static int tgsi_else(struct r600_shader_ctx *ctx);
384 static int tgsi_endif(struct r600_shader_ctx *ctx);
385 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
386 static int tgsi_endloop(struct r600_shader_ctx *ctx);
387 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
388 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
389 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
390 unsigned int dst_reg);
391 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
392 const struct r600_shader_src *shader_src,
393 unsigned chan);
394 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
395 unsigned dst_reg, unsigned mask);
396
397 static int tgsi_last_instruction(unsigned writemask)
398 {
399 int i, lasti = 0;
400
401 for (i = 0; i < 4; i++) {
402 if (writemask & (1 << i)) {
403 lasti = i;
404 }
405 }
406 return lasti;
407 }
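/* e.g. writemask 0x5 (.xz) -> 2, 0xF (.xyzw) -> 3; callers use this to know
 * on which channel to set alu.last when expanding an instruction
 * per-component.
 */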
408
409 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
410 {
411 struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
412 unsigned j;
413
414 if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
415 R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
416 return -EINVAL;
417 }
418 #if 0
419 if (i->Instruction.Label) {
420 R600_ERR("label unsupported\n");
421 return -EINVAL;
422 }
423 #endif
424 for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
425 if (i->Src[j].Register.Dimension) {
426 switch (i->Src[j].Register.File) {
427 case TGSI_FILE_CONSTANT:
428 case TGSI_FILE_HW_ATOMIC:
429 break;
430 case TGSI_FILE_INPUT:
431 if (ctx->type == PIPE_SHADER_GEOMETRY ||
432 ctx->type == PIPE_SHADER_TESS_CTRL ||
433 ctx->type == PIPE_SHADER_TESS_EVAL)
434 break;
435 case TGSI_FILE_OUTPUT: /* fallthrough from INPUT for non-GS/tess shaders */
436 if (ctx->type == PIPE_SHADER_TESS_CTRL)
437 break;
438 default: /* fallthrough from OUTPUT when not TESS_CTRL */
439 R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
440 i->Src[j].Register.File,
441 i->Src[j].Register.Dimension);
442 return -EINVAL;
443 }
444 }
445 }
446 for (j = 0; j < i->Instruction.NumDstRegs; j++) {
447 if (i->Dst[j].Register.Dimension) {
448 if (ctx->type == PIPE_SHADER_TESS_CTRL)
449 continue;
450 R600_ERR("unsupported dst (dimension)\n");
451 return -EINVAL;
452 }
453 }
454 return 0;
455 }
456
457 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
458 {
459 if (interpolate == TGSI_INTERPOLATE_COLOR ||
460 interpolate == TGSI_INTERPOLATE_LINEAR ||
461 interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
462 {
463 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
464 int loc;
465
466 switch(location) {
467 case TGSI_INTERPOLATE_LOC_CENTER:
468 loc = 1;
469 break;
470 case TGSI_INTERPOLATE_LOC_CENTROID:
471 loc = 2;
472 break;
473 case TGSI_INTERPOLATE_LOC_SAMPLE:
474 default:
475 loc = 0; break;
476 }
477
478 return is_linear * 3 + loc;
479 }
480
481 return -1;
482 }
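/* Resulting indices, which are also the priority order used when assigning
 * ij GPRs in allocate_system_value_inputs():
 *   0 persp/color sample     3 linear sample
 *   1 persp/color center     4 linear center
 *   2 persp/color centroid   5 linear centroid
 */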
483
484 static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
485 int input)
486 {
487 int i = eg_get_interpolator_index(
488 ctx->shader->input[input].interpolate,
489 ctx->shader->input[input].interpolate_location);
490 assert(i >= 0);
491 ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
492 }
493
494 static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
495 {
496 int i, r;
497 struct r600_bytecode_alu alu;
498 int gpr = 0, base_chan = 0;
499 int ij_index = ctx->shader->input[input].ij_index;
500
501 /* work out gpr and base_chan from index */
502 gpr = ij_index / 2;
503 base_chan = (2 * (ij_index % 2)) + 1;
504
505 for (i = 0; i < 8; i++) {
506 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
507
508 if (i < 4)
509 alu.op = ALU_OP2_INTERP_ZW;
510 else
511 alu.op = ALU_OP2_INTERP_XY;
512
513 if ((i > 1) && (i < 6)) {
514 alu.dst.sel = ctx->shader->input[input].gpr;
515 alu.dst.write = 1;
516 }
517
518 alu.dst.chan = i % 4;
519
520 alu.src[0].sel = gpr;
521 alu.src[0].chan = (base_chan - (i % 2));
522
523 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
524
525 alu.bank_swizzle_force = SQ_ALU_VEC_210;
526 if ((i % 4) == 3)
527 alu.last = 1;
528 r = r600_bytecode_add_alu(ctx->bc, &alu);
529 if (r)
530 return r;
531 }
532 return 0;
533 }
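/* The loop above issues two full ALU groups: i = 0..3 is INTERP_ZW and
 * i = 4..7 is INTERP_XY.  Each op really produces only two channels, so
 * dst.write is enabled just for i = 2,3 (z,w) and i = 4,5 (x,y); the
 * remaining slots are still issued to fill the group, with alu.last set on
 * channel 3 of each group.
 */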
534
535 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
536 {
537 int i, r;
538 struct r600_bytecode_alu alu;
539
540 for (i = 0; i < 4; i++) {
541 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
542
543 alu.op = ALU_OP1_INTERP_LOAD_P0;
544
545 alu.dst.sel = ctx->shader->input[input].gpr;
546 alu.dst.write = 1;
547
548 alu.dst.chan = i;
549
550 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
551 alu.src[0].chan = i;
552
553 if (i == 3)
554 alu.last = 1;
555 r = r600_bytecode_add_alu(ctx->bc, &alu);
556 if (r)
557 return r;
558 }
559 return 0;
560 }
561
562 /*
563 * Special export handling in shaders
564 *
565 * shader export ARRAY_BASE for EXPORT_POS:
566 * 60 is position
567 * 61 is misc vector
568 * 62, 63 are clip distance vectors
569 *
570 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
571 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
572 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
573 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
574 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
575 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
576 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
577 * exclusive from render target index)
578 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
579 *
580 *
581 * shader export ARRAY_BASE for EXPORT_PIXEL:
582 * 0-7 CB targets
583 * 61 computed Z vector
584 *
585 * The use of the values exported in the computed Z vector are controlled
586 * by DB_SHADER_CONTROL:
587 * Z_EXPORT_ENABLE - Z as a float in RED
588 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
589 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
590 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
591 * DB_SOURCE_FORMAT - export control restrictions
592 *
593 */
594
595
596 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
597 static int r600_spi_sid(struct r600_shader_io * io)
598 {
599 int index, name = io->name;
600
601 /* These params are handled differently, they don't need
602 * semantic indices, so we'll use 0 for them.
603 */
604 if (name == TGSI_SEMANTIC_POSITION ||
605 name == TGSI_SEMANTIC_PSIZE ||
606 name == TGSI_SEMANTIC_EDGEFLAG ||
607 name == TGSI_SEMANTIC_FACE ||
608 name == TGSI_SEMANTIC_SAMPLEMASK)
609 index = 0;
610 else {
611 if (name == TGSI_SEMANTIC_GENERIC) {
612 /* For generic params simply use sid from tgsi */
613 index = io->sid;
614 } else {
615 /* For non-generic params - pack name and sid into 8 bits */
616 index = 0x80 | (name<<3) | (io->sid);
617 }
618
619 /* Make sure that all really used indices have nonzero value, so
620 * we can just compare it to 0 later instead of comparing the name
621 * with different values to detect special cases. */
622 index++;
623 }
624
625 return index;
626 }
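/* Worked example: a GENERIC output with sid 5 gets spi_sid 6, while e.g.
 * FOG (name 3, sid 0) packs to (0x80 | 3<<3 | 0) + 1 = 0x99.  POSITION,
 * PSIZE, EDGEFLAG, FACE and SAMPLEMASK stay 0 since they are routed by
 * dedicated hardware paths rather than by semantic index.
 */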
627
628 /* we need this to get a common lds index for vs/tcs/tes input/outputs */
629 int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
630 {
631 switch (semantic_name) {
632 case TGSI_SEMANTIC_POSITION:
633 return 0;
634 case TGSI_SEMANTIC_PSIZE:
635 return 1;
636 case TGSI_SEMANTIC_CLIPDIST:
637 assert(index <= 1);
638 return 2 + index;
639 case TGSI_SEMANTIC_GENERIC:
640 if (index <= 63-4)
641 return 4 + index;
642 else
643 /* same explanation as in the default statement,
644 * the only user hitting this is st/nine.
645 */
646 return 0;
647
648 /* patch indices are completely separate and thus start from 0 */
649 case TGSI_SEMANTIC_TESSOUTER:
650 return 0;
651 case TGSI_SEMANTIC_TESSINNER:
652 return 1;
653 case TGSI_SEMANTIC_PATCH:
654 return 2 + index;
655
656 default:
657 /* Don't fail here. The result of this function is only used
658 * for LS, TCS, TES, and GS, where legacy GL semantics can't
659 * occur, but this function is called for all vertex shaders
660 * before it's known whether LS will be compiled or not.
661 */
662 return 0;
663 }
664 }
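/* So POSITION -> 0, PSIZE -> 1, CLIPDIST -> 2..3 and GENERIC[n] -> 4 + n,
 * giving every vs/tcs/tes varying a stable vec4 slot; callers turn the slot
 * into a byte offset by multiplying by 16 (see the LDS loads below).
 */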
665
666 /* turn input into interpolate on EG */
667 static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
668 {
669 int r = 0;
670
671 if (ctx->shader->input[index].spi_sid) {
672 ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
673 if (ctx->shader->input[index].interpolate > 0) {
674 evergreen_interp_assign_ij_index(ctx, index);
675 r = evergreen_interp_alu(ctx, index);
676 } else {
677 r = evergreen_interp_flat(ctx, index);
678 }
679 }
680 return r;
681 }
682
683 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
684 {
685 struct r600_bytecode_alu alu;
686 int i, r;
687 int gpr_front = ctx->shader->input[front].gpr;
688 int gpr_back = ctx->shader->input[back].gpr;
689
690 for (i = 0; i < 4; i++) {
691 memset(&alu, 0, sizeof(alu));
692 alu.op = ALU_OP3_CNDGT;
693 alu.is_op3 = 1;
694 alu.dst.write = 1;
695 alu.dst.sel = gpr_front;
696 alu.src[0].sel = ctx->face_gpr;
697 alu.src[1].sel = gpr_front;
698 alu.src[2].sel = gpr_back;
699
700 alu.dst.chan = i;
701 alu.src[1].chan = i;
702 alu.src[2].chan = i;
703 alu.last = (i==3);
704
705 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
706 return r;
707 }
708
709 return 0;
710 }
711
712 /* execute a single slot ALU calculation */
713 static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
714 int dst_sel, int dst_chan,
715 int src0_sel, unsigned src0_chan_val,
716 int src1_sel, unsigned src1_chan_val)
717 {
718 struct r600_bytecode_alu alu;
719 int r, i;
720
721 if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
722 for (i = 0; i < 4; i++) {
723 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
724 alu.op = op;
725 alu.src[0].sel = src0_sel;
726 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
727 alu.src[0].value = src0_chan_val;
728 else
729 alu.src[0].chan = src0_chan_val;
730 alu.src[1].sel = src1_sel;
731 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
732 alu.src[1].value = src1_chan_val;
733 else
734 alu.src[1].chan = src1_chan_val;
735 alu.dst.sel = dst_sel;
736 alu.dst.chan = i;
737 alu.dst.write = i == dst_chan;
738 alu.last = (i == 3);
739 r = r600_bytecode_add_alu(ctx->bc, &alu);
740 if (r)
741 return r;
742 }
743 return 0;
744 }
745
746 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
747 alu.op = op;
748 alu.src[0].sel = src0_sel;
749 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
750 alu.src[0].value = src0_chan_val;
751 else
752 alu.src[0].chan = src0_chan_val;
753 alu.src[1].sel = src1_sel;
754 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
755 alu.src[1].value = src1_chan_val;
756 else
757 alu.src[1].chan = src1_chan_val;
758 alu.dst.sel = dst_sel;
759 alu.dst.chan = dst_chan;
760 alu.dst.write = 1;
761 alu.last = 1;
762 r = r600_bytecode_add_alu(ctx->bc, &alu);
763 if (r)
764 return r;
765 return 0;
766 }
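/* The CAYMAN branch above is the MULLO_INT case from the "CAYMAN notes" at
 * the top of this file: the t-slot-only multiply is replicated across all
 * four vector slots and only the requested dst_chan is actually written.
 */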
767
768 /* execute a single slot ALU calculation */
769 static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
770 int dst_sel, int dst_chan,
771 int src0_sel, unsigned src0_chan_val,
772 int src1_sel, unsigned src1_chan_val,
773 int src2_sel, unsigned src2_chan_val)
774 {
775 struct r600_bytecode_alu alu;
776 int r;
777
778 /* validate this for other ops */
779 assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT || op == ALU_OP3_BFE_UINT);
780 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
781 alu.op = op;
782 alu.src[0].sel = src0_sel;
783 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
784 alu.src[0].value = src0_chan_val;
785 else
786 alu.src[0].chan = src0_chan_val;
787 alu.src[1].sel = src1_sel;
788 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
789 alu.src[1].value = src1_chan_val;
790 else
791 alu.src[1].chan = src1_chan_val;
792 alu.src[2].sel = src2_sel;
793 if (src2_sel == V_SQ_ALU_SRC_LITERAL)
794 alu.src[2].value = src2_chan_val;
795 else
796 alu.src[2].chan = src2_chan_val;
797 alu.dst.sel = dst_sel;
798 alu.dst.chan = dst_chan;
799 alu.is_op3 = 1;
800 alu.last = 1;
801 r = r600_bytecode_add_alu(ctx->bc, &alu);
802 if (r)
803 return r;
804 return 0;
805 }
806
807 /* put it in temp_reg.x */
808 static int get_lds_offset0(struct r600_shader_ctx *ctx,
809 int rel_patch_chan,
810 int temp_reg, bool is_patch_var)
811 {
812 int r;
813
814 /* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
815 /* ADD
816 Dimension - patch0_offset (input_vals.z),
817 Non-dim - patch0_data_offset (input_vals.w)
818 */
819 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
820 temp_reg, 0,
821 ctx->tess_output_info, 0,
822 0, rel_patch_chan,
823 ctx->tess_output_info, is_patch_var ? 3 : 2);
824 if (r)
825 return r;
826 return 0;
827 }
828
829 static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
830 {
831 return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
832 }
833
834 static int r600_get_temp(struct r600_shader_ctx *ctx)
835 {
836 return ctx->temp_reg + ctx->max_driver_temp_used++;
837 }
838
839 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
840 {
841 int i;
842 i = ctx->shader->noutput++;
843 ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
844 ctx->shader->output[i].sid = 0;
845 ctx->shader->output[i].gpr = 0;
846 ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
847 ctx->shader->output[i].write_mask = 0x4;
848 ctx->shader->output[i].spi_sid = prim_id_sid;
849
850 return 0;
851 }
852
853 static int tgsi_barrier(struct r600_shader_ctx *ctx)
854 {
855 struct r600_bytecode_alu alu;
856 int r;
857
858 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
859 alu.op = ctx->inst_info->op;
860 alu.last = 1;
861
862 r = r600_bytecode_add_alu(ctx->bc, &alu);
863 if (r)
864 return r;
865 return 0;
866 }
867
868 static void choose_spill_arrays(struct r600_shader_ctx *ctx, int *regno, unsigned *scratch_space_needed)
869 {
870 // pick largest array and spill it, repeat until the number of temps is under limit or we run out of arrays
871 unsigned n = ctx->info.array_max[TGSI_FILE_TEMPORARY];
872 unsigned narrays_left = n;
873 bool *spilled = ctx->spilled_arrays; // assumed calloc:ed
874
875 *scratch_space_needed = 0;
876 while (*regno > 124 && narrays_left) {
877 unsigned i;
878 unsigned largest = 0;
879 unsigned largest_index = 0;
880
881 for (i = 0; i < n; i++) {
882 unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
883 if (!spilled[i] && size > largest) {
884 largest = size;
885 largest_index = i;
886 }
887 }
888
889 spilled[largest_index] = true;
890 *regno -= largest;
891 *scratch_space_needed += largest;
892
893 narrays_left --;
894 }
895
896 if (narrays_left == 0) {
897 ctx->info.indirect_files &= ~(1 << TGSI_FILE_TEMPORARY);
898 }
899 }
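/* Worked example: with *regno == 130 and temp arrays of 16, 8 and 4 GPRs,
 * the first pass spills the 16-wide array (*regno becomes 114 <= 124,
 * *scratch_space_needed becomes 16 vec4 slots) and the loop stops; the two
 * smaller arrays stay in GPRs.  Only when every array has been spilled is
 * indirect TEMP addressing dropped from indirect_files, since the remaining
 * temps are then plain, directly addressed GPRs.
 */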
900
901 /* Take spilled temp arrays into account when translating tgsi register
902 * indexes into r600 gprs if spilled is false, or scratch array offset if
903 * spilled is true */
904 static int map_tgsi_reg_index_to_r600_gpr(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index, bool *spilled)
905 {
906 unsigned i;
907 unsigned spilled_size = 0;
908
909 for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
910 if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
911 if (ctx->spilled_arrays[i]) {
912 /* vec4 index into spilled scratch memory */
913 *spilled = true;
914 return tgsi_reg_index - ctx->array_infos[i].range.First + spilled_size;
915 }
916 else {
917 /* regular GPR array */
918 *spilled = false;
919 return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
920 }
921 }
922
923 if (ctx->spilled_arrays[i]) {
924 spilled_size += ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
925 }
926 }
927
928 /* regular GPR index, minus the holes from spilled arrays */
929 *spilled = false;
930
931 return tgsi_reg_index - spilled_size + ctx->file_offset[TGSI_FILE_TEMPORARY];
932 }
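/* Worked example: with TEMP[10..17] spilled and everything else in GPRs,
 * TEMP[12] maps to scratch slot 2 (*spilled = true), while TEMP[19] maps to
 * file_offset + 19 - 8 -- the eight spilled registers no longer take up GPR
 * space.
 */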
933
934 /* look up spill area base offset and array size for a spilled temp array */
935 static void get_spilled_array_base_and_size(struct r600_shader_ctx *ctx, unsigned tgsi_reg_index,
936 unsigned *array_base, unsigned *array_size)
937 {
938 unsigned i;
939 unsigned offset = 0;
940
941 for (i = 0; i < ctx->info.array_max[TGSI_FILE_TEMPORARY]; i++) {
942 if (ctx->spilled_arrays[i]) {
943 unsigned size = ctx->array_infos[i].range.Last - ctx->array_infos[i].range.First + 1;
944
945 if (tgsi_reg_index >= ctx->array_infos[i].range.First && tgsi_reg_index <= ctx->array_infos[i].range.Last) {
946 *array_base = offset;
947 *array_size = size - 1; /* hw counts from 1 */
948
949 return;
950 }
951
952 offset += size;
953 }
954 }
955 }
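/* Continuing the example above: for TEMP[10..17] this yields
 * *array_base = 0 and *array_size = 7 (the hardware encodes a size of n as
 * n - 1); a second spilled array would get *array_base = 8.
 */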
956
957 static int tgsi_declaration(struct r600_shader_ctx *ctx)
958 {
959 struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
960 int r, i, j, count = d->Range.Last - d->Range.First + 1;
961
962 switch (d->Declaration.File) {
963 case TGSI_FILE_INPUT:
964 for (j = 0; j < count; j++) {
965 i = ctx->shader->ninput + j;
966 assert(i < ARRAY_SIZE(ctx->shader->input));
967 ctx->shader->input[i].name = d->Semantic.Name;
968 ctx->shader->input[i].sid = d->Semantic.Index + j;
969 ctx->shader->input[i].interpolate = d->Interp.Interpolate;
970 ctx->shader->input[i].interpolate_location = d->Interp.Location;
971 ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
972 if (ctx->type == PIPE_SHADER_FRAGMENT) {
973 ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
974 switch (ctx->shader->input[i].name) {
975 case TGSI_SEMANTIC_FACE:
976 if (ctx->face_gpr != -1)
977 ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
978 else
979 ctx->face_gpr = ctx->shader->input[i].gpr;
980 break;
981 case TGSI_SEMANTIC_COLOR:
982 ctx->colors_used++;
983 break;
984 case TGSI_SEMANTIC_POSITION:
985 ctx->fragcoord_input = i;
986 break;
987 case TGSI_SEMANTIC_PRIMID:
988 /* set this for now */
989 ctx->shader->gs_prim_id_input = true;
990 ctx->shader->ps_prim_id_input = i;
991 break;
992 }
993 if (ctx->bc->chip_class >= EVERGREEN) {
994 if ((r = evergreen_interp_input(ctx, i)))
995 return r;
996 }
997 } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
998 /* FIXME probably skip inputs if they aren't passed in the ring */
999 ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
1000 ctx->next_ring_offset += 16;
1001 if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
1002 ctx->shader->gs_prim_id_input = true;
1003 }
1004 }
1005 ctx->shader->ninput += count;
1006 break;
1007 case TGSI_FILE_OUTPUT:
1008 for (j = 0; j < count; j++) {
1009 i = ctx->shader->noutput + j;
1010 assert(i < ARRAY_SIZE(ctx->shader->output));
1011 ctx->shader->output[i].name = d->Semantic.Name;
1012 ctx->shader->output[i].sid = d->Semantic.Index + j;
1013 ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
1014 ctx->shader->output[i].interpolate = d->Interp.Interpolate;
1015 ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
1016 if (ctx->type == PIPE_SHADER_VERTEX ||
1017 ctx->type == PIPE_SHADER_GEOMETRY ||
1018 ctx->type == PIPE_SHADER_TESS_EVAL) {
1019 ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
1020 switch (d->Semantic.Name) {
1021 case TGSI_SEMANTIC_CLIPDIST:
1022 break;
1023 case TGSI_SEMANTIC_PSIZE:
1024 ctx->shader->vs_out_misc_write = 1;
1025 ctx->shader->vs_out_point_size = 1;
1026 break;
1027 case TGSI_SEMANTIC_EDGEFLAG:
1028 ctx->shader->vs_out_misc_write = 1;
1029 ctx->shader->vs_out_edgeflag = 1;
1030 ctx->edgeflag_output = i;
1031 break;
1032 case TGSI_SEMANTIC_VIEWPORT_INDEX:
1033 ctx->shader->vs_out_misc_write = 1;
1034 ctx->shader->vs_out_viewport = 1;
1035 break;
1036 case TGSI_SEMANTIC_LAYER:
1037 ctx->shader->vs_out_misc_write = 1;
1038 ctx->shader->vs_out_layer = 1;
1039 break;
1040 case TGSI_SEMANTIC_CLIPVERTEX:
1041 ctx->clip_vertex_write = TRUE;
1042 ctx->cv_output = i;
1043 break;
1044 }
1045 if (ctx->type == PIPE_SHADER_GEOMETRY) {
1046 ctx->gs_out_ring_offset += 16;
1047 }
1048 } else if (ctx->type == PIPE_SHADER_FRAGMENT) {
1049 switch (d->Semantic.Name) {
1050 case TGSI_SEMANTIC_COLOR:
1051 ctx->shader->nr_ps_max_color_exports++;
1052 break;
1053 }
1054 }
1055 }
1056 ctx->shader->noutput += count;
1057 break;
1058 case TGSI_FILE_TEMPORARY:
1059 if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
1060 if (d->Array.ArrayID) {
1061 bool spilled;
1062 unsigned idx = map_tgsi_reg_index_to_r600_gpr(ctx,
1063 d->Range.First,
1064 &spilled);
1065
1066 if (!spilled) {
1067 r600_add_gpr_array(ctx->shader, idx,
1068 d->Range.Last - d->Range.First + 1, 0x0F);
1069 }
1070 }
1071 }
1072 break;
1073
1074 case TGSI_FILE_CONSTANT:
1075 case TGSI_FILE_SAMPLER:
1076 case TGSI_FILE_SAMPLER_VIEW:
1077 case TGSI_FILE_ADDRESS:
1078 case TGSI_FILE_BUFFER:
1079 case TGSI_FILE_IMAGE:
1080 case TGSI_FILE_MEMORY:
1081 break;
1082
1083 case TGSI_FILE_HW_ATOMIC:
1084 i = ctx->shader->nhwatomic_ranges;
1085 ctx->shader->atomics[i].start = d->Range.First;
1086 ctx->shader->atomics[i].end = d->Range.Last;
1087 ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
1088 ctx->shader->atomics[i].array_id = d->Array.ArrayID;
1089 ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
1090 ctx->shader->nhwatomic_ranges++;
1091 ctx->shader->nhwatomic += count;
1092 break;
1093
1094 case TGSI_FILE_SYSTEM_VALUE:
1095 if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
1096 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
1097 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
1098 break; /* Already handled from allocate_system_value_inputs */
1099 } else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
1100 break;
1101 } else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
1102 break;
1103 else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
1104 break;
1105 else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
1106 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
1107 int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
1108 int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
1109 unsigned temp_reg = r600_get_temp(ctx);
1110
1111 r = get_lds_offset0(ctx, 2, temp_reg, true);
1112 if (r)
1113 return r;
1114
1115 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1116 temp_reg, 0,
1117 temp_reg, 0,
1118 V_SQ_ALU_SRC_LITERAL, param * 16);
1119 if (r)
1120 return r;
1121
1122 do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
1123 }
1124 else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
1125 /* MOV r1.x, r0.x;
1126 MOV r1.y, r0.y;
1127 */
1128 for (i = 0; i < 2; i++) {
1129 struct r600_bytecode_alu alu;
1130 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1131 alu.op = ALU_OP1_MOV;
1132 alu.src[0].sel = 0;
1133 alu.src[0].chan = 0 + i;
1134 alu.dst.sel = 1;
1135 alu.dst.chan = 0 + i;
1136 alu.dst.write = 1;
1137 alu.last = (i == 1) ? 1 : 0;
1138 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1139 return r;
1140 }
1141 /* ADD r1.z, 1.0f, -r0.x */
1142 struct r600_bytecode_alu alu;
1143 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1144 alu.op = ALU_OP2_ADD;
1145 alu.src[0].sel = V_SQ_ALU_SRC_1;
1146 alu.src[1].sel = 1;
1147 alu.src[1].chan = 0;
1148 alu.src[1].neg = 1;
1149 alu.dst.sel = 1;
1150 alu.dst.chan = 2;
1151 alu.dst.write = 1;
1152 alu.last = 1;
1153 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1154 return r;
1155
1156 /* ADD r1.z, r1.z, -r1.y */
1157 alu.op = ALU_OP2_ADD;
1158 alu.src[0].sel = 1;
1159 alu.src[0].chan = 2;
1160 alu.src[1].sel = 1;
1161 alu.src[1].chan = 1;
1162 alu.src[1].neg = 1;
1163 alu.dst.sel = 1;
1164 alu.dst.chan = 2;
1165 alu.dst.write = 1;
1166 alu.last = 1;
1167 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1168 return r;
1169 break;
1170 }
1171 break;
1172 default:
1173 R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
1174 return -EINVAL;
1175 }
1176 return 0;
1177 }
1178
1179 static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
1180 {
1181 struct tgsi_parse_context parse;
1182 struct {
1183 boolean enabled;
1184 int *reg;
1185 unsigned name, alternate_name;
1186 } inputs[2] = {
1187 { false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */
1188
1189 { false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
1190 };
1191 int num_regs = 0;
1192 unsigned k, i;
1193
1194 if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
1195 return 0;
1196 }
1197
1198 /* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
1199 while (!tgsi_parse_end_of_tokens(&parse)) {
1200 tgsi_parse_token(&parse);
1201
1202 if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
1203 const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
1204 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
1205 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
1206 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
1207 {
1208 int interpolate, location, k;
1209
1210 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
1211 location = TGSI_INTERPOLATE_LOC_CENTER;
1212 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
1213 location = TGSI_INTERPOLATE_LOC_CENTER;
1214 /* Needs sample positions, currently those are always available */
1215 } else {
1216 location = TGSI_INTERPOLATE_LOC_CENTROID;
1217 }
1218
1219 interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
1220 k = eg_get_interpolator_index(interpolate, location);
1221 if (k >= 0)
1222 ctx->eg_interpolators[k].enabled = true;
1223 }
1224 } else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
1225 struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
1226 if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1227 for (k = 0; k < ARRAY_SIZE(inputs); k++) {
1228 if (d->Semantic.Name == inputs[k].name ||
1229 d->Semantic.Name == inputs[k].alternate_name) {
1230 inputs[k].enabled = true;
1231 }
1232 }
1233 }
1234 }
1235 }
1236
1237 tgsi_parse_free(&parse);
1238
1239 if (ctx->info.reads_samplemask &&
1240 (ctx->info.uses_linear_sample || ctx->info.uses_persp_sample)) {
1241 inputs[1].enabled = true;
1242 }
1243
1244 if (ctx->bc->chip_class >= EVERGREEN) {
1245 int num_baryc = 0;
1246 /* assign gpr to each interpolator according to priority */
1247 for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
1248 if (ctx->eg_interpolators[i].enabled) {
1249 ctx->eg_interpolators[i].ij_index = num_baryc;
1250 num_baryc++;
1251 }
1252 }
1253 num_baryc = (num_baryc + 1) >> 1;
1254 gpr_offset += num_baryc;
1255 }
1256
1257 for (i = 0; i < ARRAY_SIZE(inputs); i++) {
1258 boolean enabled = inputs[i].enabled;
1259 int *reg = inputs[i].reg;
1260 unsigned name = inputs[i].name;
1261
1262 if (enabled) {
1263 int gpr = gpr_offset + num_regs++;
1264 ctx->shader->nsys_inputs++;
1265
1266 // add to inputs, allocate a gpr
1267 k = ctx->shader->ninput++;
1268 ctx->shader->input[k].name = name;
1269 ctx->shader->input[k].sid = 0;
1270 ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
1271 ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
1272 *reg = ctx->shader->input[k].gpr = gpr;
1273 }
1274 }
1275
1276 return gpr_offset + num_regs;
1277 }
1278
1279 /*
1280 * for evergreen we need to scan the shader to find the number of GPRs we need to
1281 * reserve for interpolation and system values
1282 *
1283 * we need to know if we are going to emit any sample or centroid inputs
1284 * if perspective and linear are required
1285 */
1286 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
1287 {
1288 unsigned i;
1289
1290 memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
1291
1292 /*
1293 * Could get this information from the shader info. But right now
1294 * we interpolate all declared inputs, whereas the shader info will
1295 * only contain the bits if the inputs are actually used, so it might
1296 * not be safe...
1297 */
1298 for (i = 0; i < ctx->info.num_inputs; i++) {
1299 int k;
1300 /* skip position/face/mask/sampleid */
1301 if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
1302 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
1303 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
1304 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
1305 continue;
1306
1307 k = eg_get_interpolator_index(
1308 ctx->info.input_interpolate[i],
1309 ctx->info.input_interpolate_loc[i]);
1310 if (k >= 0)
1311 ctx->eg_interpolators[k].enabled = TRUE;
1312 }
1313
1314 /* XXX PULL MODEL and LINE STIPPLE */
1315
1316 return allocate_system_value_inputs(ctx, 0);
1317 }
1318
1319 /* sample_id_sel == NULL means fetch for current sample */
1320 static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
1321 {
1322 struct r600_bytecode_vtx vtx;
1323 int r, t1;
1324
1325 t1 = r600_get_temp(ctx);
1326
1327 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1328 vtx.op = FETCH_OP_VFETCH;
1329 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1330 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1331 if (sample_id == NULL) {
1332 assert(ctx->fixed_pt_position_gpr != -1);
1333
1334 vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
1335 vtx.src_sel_x = 3;
1336 }
1337 else {
1338 struct r600_bytecode_alu alu;
1339
1340 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1341 alu.op = ALU_OP1_MOV;
1342 r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
1343 alu.dst.sel = t1;
1344 alu.dst.write = 1;
1345 alu.last = 1;
1346 r = r600_bytecode_add_alu(ctx->bc, &alu);
1347 if (r)
1348 return r;
1349
1350 vtx.src_gpr = t1;
1351 vtx.src_sel_x = 0;
1352 }
1353 vtx.mega_fetch_count = 16;
1354 vtx.dst_gpr = t1;
1355 vtx.dst_sel_x = 0;
1356 vtx.dst_sel_y = 1;
1357 vtx.dst_sel_z = 2;
1358 vtx.dst_sel_w = 3;
1359 vtx.data_format = FMT_32_32_32_32_FLOAT;
1360 vtx.num_format_all = 2;
1361 vtx.format_comp_all = 1;
1362 vtx.use_const_fields = 0;
1363 vtx.offset = 0;
1364 vtx.endian = r600_endian_swap(32);
1365 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1366
1367 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1368 if (r)
1369 return r;
1370
1371 return t1;
1372 }
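/* The fetch above indexes R600_BUFFER_INFO_CONST_BUFFER by sample id --
 * either the current sample taken from fixed_pt_position_gpr.w, or an
 * explicit id moved into a temp -- and returns that sample's position as a
 * float4 in a fresh temp.
 */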
1373
1374 static int eg_load_helper_invocation(struct r600_shader_ctx *ctx)
1375 {
1376 int r;
1377 struct r600_bytecode_alu alu;
1378
1379 /* write ~0 to all lanes first; the valid-pixel-mode fetch below overwrites it with 0 for non-helper pixels */
1380 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1381 alu.op = ALU_OP1_MOV;
1382 alu.dst.sel = ctx->helper_invoc_reg;
1383 alu.dst.chan = 0;
1384 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
1385 alu.src[0].value = 0xffffffff;
1386 alu.dst.write = 1;
1387 alu.last = 1;
1388 r = r600_bytecode_add_alu(ctx->bc, &alu);
1389 if (r)
1390 return r;
1391
1392 /* do a vtx fetch in VPM mode */
1393 struct r600_bytecode_vtx vtx;
1394 memset(&vtx, 0, sizeof(vtx));
1395 vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
1396 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1397 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1398 vtx.src_gpr = 0;
1399 vtx.mega_fetch_count = 16; /* no idea here really... */
1400 vtx.dst_gpr = ctx->helper_invoc_reg;
1401 vtx.dst_sel_x = 4; /* SEL_0: valid pixels get 0, helpers keep ~0 */
1402 vtx.dst_sel_y = 7; /* SEL_MASK */
1403 vtx.dst_sel_z = 7; /* SEL_MASK */
1404 vtx.dst_sel_w = 7; /* SEL_MASK */
1405 vtx.data_format = FMT_32;
1406 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
1407 return r;
1408 ctx->bc->cf_last->vpm = 1;
1409 return 0;
1410 }
1411
1412 static int cm_load_helper_invocation(struct r600_shader_ctx *ctx)
1413 {
1414 int r;
1415 struct r600_bytecode_alu alu;
1416
1417 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1418 alu.op = ALU_OP1_MOV;
1419 alu.dst.sel = ctx->helper_invoc_reg;
1420 alu.dst.chan = 0;
1421 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
1422 alu.src[0].value = 0xffffffff;
1423 alu.dst.write = 1;
1424 alu.last = 1;
1425 r = r600_bytecode_add_alu(ctx->bc, &alu);
1426 if (r)
1427 return r;
1428
1429 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1430 alu.op = ALU_OP1_MOV;
1431 alu.dst.sel = ctx->helper_invoc_reg;
1432 alu.dst.chan = 0;
1433 alu.src[0].sel = V_SQ_ALU_SRC_0;
1434 alu.dst.write = 1;
1435 alu.last = 1;
1436 r = r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_VALID_PIXEL_MODE);
1437 if (r)
1438 return r;
1439
1440 return ctx->helper_invoc_reg;
1441 }
1442
1443 static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
1444 {
1445 struct r600_bytecode_vtx vtx;
1446 int r, t1;
1447
1448 if (load_block && ctx->cs_block_size_loaded)
1449 return ctx->cs_block_size_reg;
1450 if (!load_block && ctx->cs_grid_size_loaded)
1451 return ctx->cs_grid_size_reg;
1452
1453 t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg;
1454 struct r600_bytecode_alu alu;
1455 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1456 alu.op = ALU_OP1_MOV;
1457 alu.src[0].sel = V_SQ_ALU_SRC_0;
1458 alu.dst.sel = t1;
1459 alu.dst.write = 1;
1460 alu.last = 1;
1461 r = r600_bytecode_add_alu(ctx->bc, &alu);
1462 if (r)
1463 return r;
1464
1465 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1466 vtx.op = FETCH_OP_VFETCH;
1467 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1468 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1469 vtx.src_gpr = t1;
1470 vtx.src_sel_x = 0;
1471
1472 vtx.mega_fetch_count = 16;
1473 vtx.dst_gpr = t1;
1474 vtx.dst_sel_x = 0;
1475 vtx.dst_sel_y = 1;
1476 vtx.dst_sel_z = 2;
1477 vtx.dst_sel_w = 7;
1478 vtx.data_format = FMT_32_32_32_32;
1479 vtx.num_format_all = 1;
1480 vtx.format_comp_all = 0;
1481 vtx.use_const_fields = 0;
1482 vtx.offset = load_block ? 0 : 16; // first element is size of buffer
1483 vtx.endian = r600_endian_swap(32);
1484 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1485
1486 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1487 if (r)
1488 return r;
1489
1490 if (load_block)
1491 ctx->cs_block_size_loaded = true;
1492 else
1493 ctx->cs_grid_size_loaded = true;
1494 return t1;
1495 }
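/* Per the vtx.offset selection above, both sizes live in
 * R600_BUFFER_INFO_CONST_BUFFER: the block size vec3 at byte offset 0 and
 * the grid size vec3 at byte offset 16; each is fetched once and then
 * cached in its dedicated register.
 */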
1496
1497 static void tgsi_src(struct r600_shader_ctx *ctx,
1498 const struct tgsi_full_src_register *tgsi_src,
1499 struct r600_shader_src *r600_src)
1500 {
1501 memset(r600_src, 0, sizeof(*r600_src));
1502 r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
1503 r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
1504 r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
1505 r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
1506 r600_src->neg = tgsi_src->Register.Negate;
1507 r600_src->abs = tgsi_src->Register.Absolute;
1508
1509 if (tgsi_src->Register.File == TGSI_FILE_TEMPORARY) {
1510 bool spilled;
1511 unsigned idx;
1512
1513 idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_src->Register.Index, &spilled);
1514
1515 if (spilled) {
1516 int reg = r600_get_temp(ctx);
1517 int r;
1518
1519 r600_src->sel = reg;
1520
1521 if (ctx->bc->chip_class < R700) {
1522 struct r600_bytecode_output cf;
1523
1524 memset(&cf, 0, sizeof(struct r600_bytecode_output));
1525 cf.op = CF_OP_MEM_SCRATCH;
1526 cf.elem_size = 3;
1527 cf.gpr = reg;
1528 cf.comp_mask = 0xF;
1529 cf.swizzle_x = 0;
1530 cf.swizzle_y = 1;
1531 cf.swizzle_z = 2;
1532 cf.swizzle_w = 3;
1533 cf.burst_count = 1;
1534
1535 get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
1536 &cf.array_base, &cf.array_size);
1537
1538 if (tgsi_src->Register.Indirect) {
1539 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
1540 cf.index_gpr = ctx->bc->ar_reg;
1541 }
1542 else {
1543 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ;
1544 cf.array_base += idx;
1545 cf.array_size = 0;
1546 }
1547
1548 r = r600_bytecode_add_output(ctx->bc, &cf);
1549 }
1550 else {
1551 struct r600_bytecode_vtx vtx;
1552
1553 if (r600_bytecode_get_need_wait_ack(ctx->bc)) {
1554 r600_bytecode_need_wait_ack(ctx->bc, false);
1555 r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
1556 }
1557
1558 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1559 vtx.op = FETCH_OP_READ_SCRATCH;
1560 vtx.dst_gpr = reg;
1561 vtx.uncached = 1; // Must bypass cache since prior spill written in same invocation
1562 vtx.elem_size = 3;
1563 vtx.data_format = FMT_32_32_32_32;
1564 vtx.num_format_all = V_038010_SQ_NUM_FORMAT_INT;
1565 vtx.dst_sel_x = tgsi_src->Register.SwizzleX;
1566 vtx.dst_sel_y = tgsi_src->Register.SwizzleY;
1567 vtx.dst_sel_z = tgsi_src->Register.SwizzleZ;
1568 vtx.dst_sel_w = tgsi_src->Register.SwizzleW;
1569
1570 get_spilled_array_base_and_size(ctx, tgsi_src->Register.Index,
1571 &vtx.array_base, &vtx.array_size);
1572
1573 if (tgsi_src->Register.Indirect) {
1574 vtx.indexed = 1;
1575 vtx.src_gpr = ctx->bc->ar_reg;
1576 }
1577 else {
1578 vtx.array_base += idx;
1579 vtx.array_size = 0;
1580 }
1581
1582 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1583 }
1584
1585 if (r)
1586 return;
1587 }
1588 else {
1589 if (tgsi_src->Register.Indirect)
1590 r600_src->rel = V_SQ_REL_RELATIVE;
1591
1592 r600_src->sel = idx;
1593 }
1594
1595 return;
1596 }
1597
1598 if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
1599 int index;
1600 if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
1601 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
1602 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
1603
1604 index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
1605 r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
1606 if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
1607 return;
1608 }
1609 index = tgsi_src->Register.Index;
1610 r600_src->sel = V_SQ_ALU_SRC_LITERAL;
1611 memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
1612 } else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
1613 if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
1614 r600_src->swizzle[0] = 2; // Z value
1615 r600_src->swizzle[1] = 2;
1616 r600_src->swizzle[2] = 2;
1617 r600_src->swizzle[3] = 2;
1618 r600_src->sel = ctx->face_gpr;
1619 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
1620 r600_src->swizzle[0] = 3; // W value
1621 r600_src->swizzle[1] = 3;
1622 r600_src->swizzle[2] = 3;
1623 r600_src->swizzle[3] = 3;
1624 r600_src->sel = ctx->fixed_pt_position_gpr;
1625 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
1626 r600_src->swizzle[0] = 0;
1627 r600_src->swizzle[1] = 1;
1628 r600_src->swizzle[2] = 4;
1629 r600_src->swizzle[3] = 4;
1630 r600_src->sel = load_sample_position(ctx, NULL, -1);
1631 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
1632 r600_src->swizzle[0] = 3;
1633 r600_src->swizzle[1] = 3;
1634 r600_src->swizzle[2] = 3;
1635 r600_src->swizzle[3] = 3;
1636 r600_src->sel = 0;
1637 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
1638 r600_src->swizzle[0] = 0;
1639 r600_src->swizzle[1] = 0;
1640 r600_src->swizzle[2] = 0;
1641 r600_src->swizzle[3] = 0;
1642 r600_src->sel = 0;
1643 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
1644 r600_src->sel = 0;
1645 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
1646 r600_src->sel = 1;
1647 } else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1648 r600_src->swizzle[0] = 3;
1649 r600_src->swizzle[1] = 3;
1650 r600_src->swizzle[2] = 3;
1651 r600_src->swizzle[3] = 3;
1652 r600_src->sel = 1;
1653 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1654 r600_src->swizzle[0] = 2;
1655 r600_src->swizzle[1] = 2;
1656 r600_src->swizzle[2] = 2;
1657 r600_src->swizzle[3] = 2;
1658 r600_src->sel = 0;
1659 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
1660 r600_src->sel = 1;
1661 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
1662 r600_src->sel = 3;
1663 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
1664 r600_src->sel = 2;
1665 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
1666 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
1667 r600_src->sel = ctx->tess_input_info;
1668 r600_src->swizzle[0] = 2;
1669 r600_src->swizzle[1] = 2;
1670 r600_src->swizzle[2] = 2;
1671 r600_src->swizzle[3] = 2;
1672 } else {
1673 r600_src->sel = ctx->tess_input_info;
1674 r600_src->swizzle[0] = 3;
1675 r600_src->swizzle[1] = 3;
1676 r600_src->swizzle[2] = 3;
1677 r600_src->swizzle[3] = 3;
1678 }
1679 } else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
1680 r600_src->sel = 0;
1681 r600_src->swizzle[0] = 0;
1682 r600_src->swizzle[1] = 0;
1683 r600_src->swizzle[2] = 0;
1684 r600_src->swizzle[3] = 0;
1685 } else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
1686 r600_src->sel = 0;
1687 r600_src->swizzle[0] = 3;
1688 r600_src->swizzle[1] = 3;
1689 r600_src->swizzle[2] = 3;
1690 r600_src->swizzle[3] = 3;
1691 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
1692 r600_src->sel = load_block_grid_size(ctx, false);
1693 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
1694 r600_src->sel = load_block_grid_size(ctx, true);
1695 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_HELPER_INVOCATION) {
1696 r600_src->sel = ctx->helper_invoc_reg;
1697 r600_src->swizzle[0] = 0;
1698 r600_src->swizzle[1] = 0;
1699 r600_src->swizzle[2] = 0;
1700 r600_src->swizzle[3] = 0;
1701 }
1702 } else {
1703 if (tgsi_src->Register.Indirect)
1704 r600_src->rel = V_SQ_REL_RELATIVE;
1705 r600_src->sel = tgsi_src->Register.Index;
1706 r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
1707 }
1708 if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
1709 if (tgsi_src->Register.Dimension) {
1710 r600_src->kc_bank = tgsi_src->Dimension.Index;
1711 if (tgsi_src->Dimension.Indirect) {
1712 r600_src->kc_rel = 1;
1713 }
1714 }
1715 }
1716 }
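/* In short, reading a spilled TEMP becomes a scratch read into a fresh
 * temp: pre-R700 uses a CF_OP_MEM_SCRATCH read (READ_IND indexed via ar_reg
 * when the access is indirect), while R700+ uses a FETCH_OP_READ_SCRATCH
 * vfetch, first flushing any pending scratch write with WAIT_ACK so the
 * read observes it.
 */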
1717
1718 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
1719 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
1720 unsigned int dst_reg)
1721 {
1722 struct r600_bytecode_vtx vtx;
1723 unsigned int ar_reg;
1724 int r;
1725
1726 if (offset) {
1727 struct r600_bytecode_alu alu;
1728
1729 memset(&alu, 0, sizeof(alu));
1730
1731 alu.op = ALU_OP2_ADD_INT;
1732 alu.src[0].sel = ctx->bc->ar_reg;
1733 alu.src[0].chan = ar_chan;
1734
1735 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1736 alu.src[1].value = offset;
1737
1738 alu.dst.sel = dst_reg;
1739 alu.dst.chan = ar_chan;
1740 alu.dst.write = 1;
1741 alu.last = 1;
1742
1743 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1744 return r;
1745
1746 ar_reg = dst_reg;
1747 } else {
1748 ar_reg = ctx->bc->ar_reg;
1749 }
1750
1751 memset(&vtx, 0, sizeof(vtx));
1752 vtx.buffer_id = cb_idx;
1753 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1754 vtx.src_gpr = ar_reg;
1755 vtx.src_sel_x = ar_chan;
1756 vtx.mega_fetch_count = 16;
1757 vtx.dst_gpr = dst_reg;
1758 vtx.dst_sel_x = 0; /* SEL_X */
1759 vtx.dst_sel_y = 1; /* SEL_Y */
1760 vtx.dst_sel_z = 2; /* SEL_Z */
1761 vtx.dst_sel_w = 3; /* SEL_W */
1762 vtx.data_format = FMT_32_32_32_32_FLOAT;
1763 vtx.num_format_all = 2; /* NUM_FORMAT_SCALED */
1764 vtx.format_comp_all = 1; /* FORMAT_COMP_SIGNED */
1765 vtx.endian = r600_endian_swap(32);
1766 vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;
1767
1768 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1769 return r;
1770
1771 return 0;
1772 }
1773
1774 static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1775 {
1776 struct r600_bytecode_vtx vtx;
1777 int r;
1778 unsigned index = src->Register.Index;
1779 unsigned vtx_id = src->Dimension.Index;
1780 int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
1781 int offset_chan = vtx_id % 3;
1782 int t2 = 0;
1783
1784 /* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
1785 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
1786
1787 if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
1788 offset_chan = 3;
1789
1790 if (src->Dimension.Indirect || src->Register.Indirect)
1791 t2 = r600_get_temp(ctx);
1792
1793 if (src->Dimension.Indirect) {
1794 int treg[3];
1795 struct r600_bytecode_alu alu;
1796 int r, i;
1797 unsigned addr_reg;
1798 addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
1799 if (src->DimIndirect.Index > 0) {
1800 r = single_alu_op2(ctx, ALU_OP1_MOV,
1801 ctx->bc->ar_reg, 0,
1802 addr_reg, 0,
1803 0, 0);
1804 if (r)
1805 return r;
1806 }
1807 /*
1808 * We have to put R0.x/y/w into Rt.x, Rt+1.x, Rt+2.x and then index
1809 * a register relative to Rt; at least this is what fglrx seems to do. */
1810 for (i = 0; i < 3; i++) {
1811 treg[i] = r600_get_temp(ctx);
1812 }
1813 r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
1814
1815 for (i = 0; i < 3; i++) {
1816 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1817 alu.op = ALU_OP1_MOV;
1818 alu.src[0].sel = ctx->gs_rotated_input[0];
1819 alu.src[0].chan = i == 2 ? 3 : i;
1820 alu.dst.sel = treg[i];
1821 alu.dst.chan = 0;
1822 alu.dst.write = 1;
1823 alu.last = 1;
1824 r = r600_bytecode_add_alu(ctx->bc, &alu);
1825 if (r)
1826 return r;
1827 }
1828 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1829 alu.op = ALU_OP1_MOV;
1830 alu.src[0].sel = treg[0];
1831 alu.src[0].rel = 1;
1832 alu.dst.sel = t2;
1833 alu.dst.write = 1;
1834 alu.last = 1;
1835 r = r600_bytecode_add_alu(ctx->bc, &alu);
1836 if (r)
1837 return r;
1838 offset_reg = t2;
1839 offset_chan = 0;
1840 }
1841
1842 if (src->Register.Indirect) {
1843 int addr_reg;
1844 unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];
1845
1846 addr_reg = get_address_file_reg(ctx, src->Indirect.Index);
1847
1848 /* pull the value from index_reg */
1849 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1850 t2, 1,
1851 addr_reg, 0,
1852 V_SQ_ALU_SRC_LITERAL, first);
1853 if (r)
1854 return r;
1855 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1856 t2, 0,
1857 t2, 1,
1858 V_SQ_ALU_SRC_LITERAL, 4,
1859 offset_reg, offset_chan);
1860 if (r)
1861 return r;
1862 offset_reg = t2;
1863 offset_chan = 0;
1864 index = src->Register.Index - first;
1865 }
1866
1867 memset(&vtx, 0, sizeof(vtx));
1868 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1869 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1870 vtx.src_gpr = offset_reg;
1871 vtx.src_sel_x = offset_chan;
1872 vtx.offset = index * 16; /* bytes */
1873 vtx.mega_fetch_count = 16;
1874 vtx.dst_gpr = dst_reg;
1875 vtx.dst_sel_x = 0; /* SEL_X */
1876 vtx.dst_sel_y = 1; /* SEL_Y */
1877 vtx.dst_sel_z = 2; /* SEL_Z */
1878 vtx.dst_sel_w = 3; /* SEL_W */
1879 if (ctx->bc->chip_class >= EVERGREEN) {
1880 vtx.use_const_fields = 1;
1881 } else {
1882 vtx.data_format = FMT_32_32_32_32_FLOAT;
1883 }
1884
1885 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1886 return r;
1887
1888 return 0;
1889 }
1890
1891 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1892 {
1893 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1894 unsigned i;
1895
1896 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1897 struct tgsi_full_src_register *src = &inst->Src[i];
1898
1899 if (src->Register.File == TGSI_FILE_INPUT) {
1900 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1901 /* primitive id is in R0.z */
1902 ctx->src[i].sel = 0;
1903 ctx->src[i].swizzle[0] = 2;
1904 }
1905 }
1906 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1907 int treg = r600_get_temp(ctx);
1908
1909 fetch_gs_input(ctx, src, treg);
1910 ctx->src[i].sel = treg;
1911 ctx->src[i].rel = 0;
1912 }
1913 }
1914 return 0;
1915 }
1916
1917
1918 /* Tessellation shaders pass outputs to the next shader using LDS.
1919 *
1920 * LS outputs = TCS(HS) inputs
1921 * TCS(HS) outputs = TES(DS) inputs
1922 *
1923 * The LDS layout is:
1924 * - TCS inputs for patch 0
1925 * - TCS inputs for patch 1
1926 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
1927 * - ...
1928 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
1929 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
1930 * - TCS outputs for patch 1
1931 * - Per-patch TCS outputs for patch 1
1932 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
1933 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
1934 * - ...
1935 *
1936 * All three shaders VS(LS), TCS, TES share the same LDS space.
1937 */
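/* A minimal illustrative sketch (not driver code) of how the layout above
 * maps to offsets; num_patches, in_patch_size and out_patch_size are
 * hypothetical per-patch sizes, and out_patch_size is assumed to include
 * the per-patch data block. */
#if 0
static unsigned sketch_tcs_in_patch_offset(unsigned rel_patch_id,
                                           unsigned in_patch_size)
{
	/* TCS inputs for patch N start right after those of patches 0..N-1 */
	return rel_patch_id * in_patch_size;
}

static unsigned sketch_tcs_out_patch_offset(unsigned rel_patch_id,
                                            unsigned num_patches,
                                            unsigned in_patch_size,
                                            unsigned out_patch_size)
{
	/* all TCS outputs live after the TCS inputs of every patch */
	return num_patches * in_patch_size + rel_patch_id * out_patch_size;
}
#endif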
1938 /* this will return with the byte address in temp_reg.x */
1939 static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
1940 const struct tgsi_full_dst_register *dst,
1941 const struct tgsi_full_src_register *src,
1942 int stride_bytes_reg, int stride_bytes_chan)
1943 {
1944 struct tgsi_full_dst_register reg;
1945 ubyte *name, *index, *array_first;
1946 int r;
1947 int param;
1948 struct tgsi_shader_info *info = &ctx->info;
1949 /* Set the register description. The address computation is the same
1950 * for sources and destinations. */
1951 if (src) {
1952 reg.Register.File = src->Register.File;
1953 reg.Register.Index = src->Register.Index;
1954 reg.Register.Indirect = src->Register.Indirect;
1955 reg.Register.Dimension = src->Register.Dimension;
1956 reg.Indirect = src->Indirect;
1957 reg.Dimension = src->Dimension;
1958 reg.DimIndirect = src->DimIndirect;
1959 } else
1960 reg = *dst;
1961
1962 /* If the register is 2-dimensional (e.g. an array of vertices
1963 * in a primitive), calculate the base address of the vertex. */
1964 if (reg.Register.Dimension) {
1965 int sel, chan;
1966 if (reg.Dimension.Indirect) {
1967 unsigned addr_reg;
1968 assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);
1969
1970 addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
1971 /* pull the value from index_reg */
1972 sel = addr_reg;
1973 chan = 0;
1974 } else {
1975 sel = V_SQ_ALU_SRC_LITERAL;
1976 chan = reg.Dimension.Index;
1977 }
1978
1979 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1980 temp_reg, 0,
1981 stride_bytes_reg, stride_bytes_chan,
1982 sel, chan,
1983 temp_reg, 0);
1984 if (r)
1985 return r;
1986 }
1987
1988 if (reg.Register.File == TGSI_FILE_INPUT) {
1989 name = info->input_semantic_name;
1990 index = info->input_semantic_index;
1991 array_first = info->input_array_first;
1992 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
1993 name = info->output_semantic_name;
1994 index = info->output_semantic_index;
1995 array_first = info->output_array_first;
1996 } else {
1997 assert(0);
1998 return -1;
1999 }
2000 if (reg.Register.Indirect) {
2001 int addr_reg;
2002 int first;
2003 /* Add the relative address of the element. */
2004 if (reg.Indirect.ArrayID)
2005 first = array_first[reg.Indirect.ArrayID];
2006 else
2007 first = reg.Register.Index;
2008
2009 addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);
2010
2011 /* pull the value from index_reg */
2012 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
2013 temp_reg, 0,
2014 V_SQ_ALU_SRC_LITERAL, 16,
2015 addr_reg, 0,
2016 temp_reg, 0);
2017 if (r)
2018 return r;
2019
2020 param = r600_get_lds_unique_index(name[first],
2021 index[first]);
2022
2023 } else {
2024 param = r600_get_lds_unique_index(name[reg.Register.Index],
2025 index[reg.Register.Index]);
2026 }
2027
2028 /* add to base_addr - passed in temp_reg.x */
2029 if (param) {
2030 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2031 temp_reg, 0,
2032 temp_reg, 0,
2033 V_SQ_ALU_SRC_LITERAL, param * 16);
2034 if (r)
2035 return r;
2036
2037 }
2038 return 0;
2039 }
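/* To summarize, the address accumulated in temp_reg.x by the function
 * above is:
 *   base (incoming temp_reg.x)
 *   + vertex_index * stride_bytes   (for 2-dimensional registers)
 *   + relative_index * 16           (for indirect addressing)
 *   + param * 16                    (16 bytes per vec4 param slot)
 */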
2040
2041 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
2042 unsigned dst_reg, unsigned mask)
2043 {
2044 struct r600_bytecode_alu alu;
2045 int r, i, lasti;
2046
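/* an ALU clause is limited to 128 instruction slots (each slot is two
 * dwords); if the current clause is nearly full, force a new CF
 * instruction so the address/read sequence below fits in one clause */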
2047 if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
2048 ctx->bc->force_add_cf = 1;
2049
2050 lasti = tgsi_last_instruction(mask);
2051 for (i = 1; i <= lasti; i++) {
2052 if (!(mask & (1 << i)))
2053 continue;
2054
2055 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2056 temp_reg, i,
2057 temp_reg, 0,
2058 V_SQ_ALU_SRC_LITERAL, 4 * i);
2059 if (r)
2060 return r;
2061 }
2062 for (i = 0; i <= lasti; i++) {
2063 if (!(mask & (1 << i)))
2064 continue;
2065
2066 /* emit an LDS_READ_RET */
2067 memset(&alu, 0, sizeof(alu));
2068 alu.op = LDS_OP1_LDS_READ_RET;
2069 alu.src[0].sel = temp_reg;
2070 alu.src[0].chan = i;
2071 alu.src[1].sel = V_SQ_ALU_SRC_0;
2072 alu.src[2].sel = V_SQ_ALU_SRC_0;
2073 alu.dst.chan = 0;
2074 alu.is_lds_idx_op = true;
2075 alu.last = 1;
2076 r = r600_bytecode_add_alu(ctx->bc, &alu);
2077 if (r)
2078 return r;
2079 }
2080 for (i = 0; i <= lasti; i++) {
2081 if (!(mask & (1 << i)))
2082 continue;
2083
2084 /* then read from LDS_OQ_A_POP */
2085 memset(&alu, 0, sizeof(alu));
2086
2087 alu.op = ALU_OP1_MOV;
2088 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
2089 alu.src[0].chan = 0;
2090 alu.dst.sel = dst_reg;
2091 alu.dst.chan = i;
2092 alu.dst.write = 1;
2093 alu.last = 1;
2094 r = r600_bytecode_add_alu(ctx->bc, &alu);
2095 if (r)
2096 return r;
2097 }
2098 return 0;
2099 }
2100
2101 static int fetch_mask(struct tgsi_src_register *reg)
2102 {
2103 int mask = 0;
2104 mask |= 1 << reg->SwizzleX;
2105 mask |= 1 << reg->SwizzleY;
2106 mask |= 1 << reg->SwizzleZ;
2107 mask |= 1 << reg->SwizzleW;
2108 return mask;
2109 }
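/* e.g. a .xxzz swizzle only reads channels 0 and 2, so fetch_mask()
 * returns 0x5 and do_lds_fetch_values() can skip the y and w reads */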
2110
2111 static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2112 {
2113 int r;
2114 unsigned temp_reg = r600_get_temp(ctx);
2115
2116 r = get_lds_offset0(ctx, 2, temp_reg,
2117 src->Register.Dimension ? false : true);
2118 if (r)
2119 return r;
2120
2121 /* the base address is now in temp.x */
2122 r = r600_get_byte_address(ctx, temp_reg,
2123 NULL, src, ctx->tess_output_info, 1);
2124 if (r)
2125 return r;
2126
2127 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2128 if (r)
2129 return r;
2130 return 0;
2131 }
2132
2133 static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2134 {
2135 int r;
2136 unsigned temp_reg = r600_get_temp(ctx);
2137
2138 /* t.x = ips * r0.y: input patch size (tess_input_info.x) times RelPatchID (r0.y) */
2139 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
2140 temp_reg, 0,
2141 ctx->tess_input_info, 0,
2142 0, 1);
2143
2144 if (r)
2145 return r;
2146
2147 /* the base address is now in temp.x */
2148 r = r600_get_byte_address(ctx, temp_reg,
2149 NULL, src, ctx->tess_input_info, 1);
2150 if (r)
2151 return r;
2152
2153 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2154 if (r)
2155 return r;
2156 return 0;
2157 }
2158
2159 static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
2160 {
2161 int r;
2162 unsigned temp_reg = r600_get_temp(ctx);
2163
2164 r = get_lds_offset0(ctx, 1, temp_reg,
2165 src->Register.Dimension ? false : true);
2166 if (r)
2167 return r;
2168 /* the base address is now in temp.x */
2169 r = r600_get_byte_address(ctx, temp_reg,
2170 NULL, src,
2171 ctx->tess_output_info, 1);
2172 if (r)
2173 return r;
2174
2175 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
2176 if (r)
2177 return r;
2178 return 0;
2179 }
2180
2181 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
2182 {
2183 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2184 unsigned i;
2185
2186 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2187 struct tgsi_full_src_register *src = &inst->Src[i];
2188
2189 if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
2190 int treg = r600_get_temp(ctx);
2191 fetch_tes_input(ctx, src, treg);
2192 ctx->src[i].sel = treg;
2193 ctx->src[i].rel = 0;
2194 }
2195 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
2196 int treg = r600_get_temp(ctx);
2197 fetch_tcs_input(ctx, src, treg);
2198 ctx->src[i].sel = treg;
2199 ctx->src[i].rel = 0;
2200 }
2201 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
2202 int treg = r600_get_temp(ctx);
2203 fetch_tcs_output(ctx, src, treg);
2204 ctx->src[i].sel = treg;
2205 ctx->src[i].rel = 0;
2206 }
2207 }
2208 return 0;
2209 }
2210
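/* The hardware can only read a limited number of distinct constant-file
 * operands per ALU instruction, so when an instruction has more than one
 * constant source, all but one are copied into temporaries first;
 * relatively addressed constants are always fetched into a temporary. */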
2211 static int tgsi_split_constant(struct r600_shader_ctx *ctx)
2212 {
2213 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2214 struct r600_bytecode_alu alu;
2215 int i, j, k, nconst, r;
2216
2217 for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
2218 if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
2219 nconst++;
2220 }
2221 tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
2222 }
2223 for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
2224 if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
2225 continue;
2226 }
2227
2228 if (ctx->src[i].rel) {
2229 int chan = inst->Src[i].Indirect.Swizzle;
2230 int treg = r600_get_temp(ctx);
2231 if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
2232 return r;
2233
2234 ctx->src[i].kc_bank = 0;
2235 ctx->src[i].kc_rel = 0;
2236 ctx->src[i].sel = treg;
2237 ctx->src[i].rel = 0;
2238 j--;
2239 } else if (j > 0) {
2240 int treg = r600_get_temp(ctx);
2241 for (k = 0; k < 4; k++) {
2242 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2243 alu.op = ALU_OP1_MOV;
2244 alu.src[0].sel = ctx->src[i].sel;
2245 alu.src[0].chan = k;
2246 alu.src[0].rel = ctx->src[i].rel;
2247 alu.src[0].kc_bank = ctx->src[i].kc_bank;
2248 alu.src[0].kc_rel = ctx->src[i].kc_rel;
2249 alu.dst.sel = treg;
2250 alu.dst.chan = k;
2251 alu.dst.write = 1;
2252 if (k == 3)
2253 alu.last = 1;
2254 r = r600_bytecode_add_alu(ctx->bc, &alu);
2255 if (r)
2256 return r;
2257 }
2258 ctx->src[i].sel = treg;
2259 ctx->src[i].rel = 0;
2260 j--;
2261 }
2262 }
2263 return 0;
2264 }
2265
2266 /* Move immediates into a temp when an instruction carries several of them - the lowered trig functions need the literal slots for their own constants (PI etc.) */
2267 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
2268 {
2269 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2270 struct r600_bytecode_alu alu;
2271 int i, j, k, nliteral, r;
2272
2273 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
2274 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2275 nliteral++;
2276 }
2277 }
2278 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
2279 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2280 int treg = r600_get_temp(ctx);
2281 for (k = 0; k < 4; k++) {
2282 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2283 alu.op = ALU_OP1_MOV;
2284 alu.src[0].sel = ctx->src[i].sel;
2285 alu.src[0].chan = k;
2286 alu.src[0].value = ctx->src[i].value[k];
2287 alu.dst.sel = treg;
2288 alu.dst.chan = k;
2289 alu.dst.write = 1;
2290 if (k == 3)
2291 alu.last = 1;
2292 r = r600_bytecode_add_alu(ctx->bc, &alu);
2293 if (r)
2294 return r;
2295 }
2296 ctx->src[i].sel = treg;
2297 j--;
2298 }
2299 }
2300 return 0;
2301 }
2302
2303 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
2304 {
2305 int i, r, count = ctx->shader->ninput;
2306
2307 for (i = 0; i < count; i++) {
2308 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2309 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
2310 if (r)
2311 return r;
2312 }
2313 }
2314 return 0;
2315 }
2316
2317 static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
2318 int stream, unsigned *stream_item_size UNUSED)
2319 {
2320 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
2321 unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
2322 int j, r;
2323 unsigned i;
2324
2325 /* Sanity checking. */
2326 if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
2327 R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
2328 r = -EINVAL;
2329 goto out_err;
2330 }
2331 for (i = 0; i < so->num_outputs; i++) {
2332 if (so->output[i].output_buffer >= 4) {
2333 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
2334 so->output[i].output_buffer);
2335 r = -EINVAL;
2336 goto out_err;
2337 }
2338 }
2339
2340 /* Initialize locations where the outputs are stored. */
2341 for (i = 0; i < so->num_outputs; i++) {
2342
2343 so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
2344 start_comp[i] = so->output[i].start_component;
2345 /* Lower outputs with dst_offset < start_component.
2346 *
2347 * We can only output 4D vectors with a write mask, e.g. we can
2348 * only output the W component at offset 3, etc. If we want
2349 * to store Y, Z, or W at buffer offset 0, we need to use MOV
2350 * to move it to X and output X. */
2351 if (so->output[i].dst_offset < so->output[i].start_component) {
2352 unsigned tmp = r600_get_temp(ctx);
2353
2354 for (j = 0; j < so->output[i].num_components; j++) {
2355 struct r600_bytecode_alu alu;
2356 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2357 alu.op = ALU_OP1_MOV;
2358 alu.src[0].sel = so_gpr[i];
2359 alu.src[0].chan = so->output[i].start_component + j;
2360
2361 alu.dst.sel = tmp;
2362 alu.dst.chan = j;
2363 alu.dst.write = 1;
2364 if (j == so->output[i].num_components - 1)
2365 alu.last = 1;
2366 r = r600_bytecode_add_alu(ctx->bc, &alu);
2367 if (r)
2368 return r;
2369 }
2370 start_comp[i] = 0;
2371 so_gpr[i] = tmp;
2372 }
2373 }
2374
2375 /* Write outputs to buffers. */
2376 for (i = 0; i < so->num_outputs; i++) {
2377 struct r600_bytecode_output output;
2378
2379 if (stream != -1 && stream != so->output[i].stream)
2380 continue;
2381
2382 memset(&output, 0, sizeof(struct r600_bytecode_output));
2383 output.gpr = so_gpr[i];
2384 output.elem_size = so->output[i].num_components - 1;
2385 if (output.elem_size == 2)
2386 output.elem_size = 3; // 3 not supported, write 4 with junk at end
2387 output.array_base = so->output[i].dst_offset - start_comp[i];
2388 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2389 output.burst_count = 1;
2390 /* array_size is an upper limit for the burst_count
2391 * with MEM_STREAM instructions */
2392 output.array_size = 0xFFF;
2393 output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
2394
2395 if (ctx->bc->chip_class >= EVERGREEN) {
2396 switch (so->output[i].output_buffer) {
2397 case 0:
2398 output.op = CF_OP_MEM_STREAM0_BUF0;
2399 break;
2400 case 1:
2401 output.op = CF_OP_MEM_STREAM0_BUF1;
2402 break;
2403 case 2:
2404 output.op = CF_OP_MEM_STREAM0_BUF2;
2405 break;
2406 case 3:
2407 output.op = CF_OP_MEM_STREAM0_BUF3;
2408 break;
2409 }
2410 output.op += so->output[i].stream * 4;
2411 assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
2412 ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
2413 } else {
2414 switch (so->output[i].output_buffer) {
2415 case 0:
2416 output.op = CF_OP_MEM_STREAM0;
2417 break;
2418 case 1:
2419 output.op = CF_OP_MEM_STREAM1;
2420 break;
2421 case 2:
2422 output.op = CF_OP_MEM_STREAM2;
2423 break;
2424 case 3:
2425 output.op = CF_OP_MEM_STREAM3;
2426 break;
2427 }
2428 ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
2429 }
2430 r = r600_bytecode_add_output(ctx->bc, &output);
2431 if (r)
2432 goto out_err;
2433 }
2434 return 0;
2435 out_err:
2436 return r;
2437 }
2438
2439 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2440 {
2441 struct r600_bytecode_alu alu;
2442 unsigned reg;
2443
2444 if (!ctx->shader->vs_out_edgeflag)
2445 return;
2446
2447 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2448
2449 /* clamp(x, 0, 1) */
2450 memset(&alu, 0, sizeof(alu));
2451 alu.op = ALU_OP1_MOV;
2452 alu.src[0].sel = reg;
2453 alu.dst.sel = reg;
2454 alu.dst.write = 1;
2455 alu.dst.clamp = 1;
2456 alu.last = 1;
2457 r600_bytecode_add_alu(ctx->bc, &alu);
2458
2459 memset(&alu, 0, sizeof(alu));
2460 alu.op = ALU_OP1_FLT_TO_INT;
2461 alu.src[0].sel = reg;
2462 alu.dst.sel = reg;
2463 alu.dst.write = 1;
2464 alu.last = 1;
2465 r600_bytecode_add_alu(ctx->bc, &alu);
2466 }
2467
2468 static int generate_gs_copy_shader(struct r600_context *rctx,
2469 struct r600_pipe_shader *gs,
2470 struct pipe_stream_output_info *so)
2471 {
2472 struct r600_shader_ctx ctx = {};
2473 struct r600_shader *gs_shader = &gs->shader;
2474 struct r600_pipe_shader *cshader;
2475 unsigned ocnt = gs_shader->noutput;
2476 struct r600_bytecode_alu alu;
2477 struct r600_bytecode_vtx vtx;
2478 struct r600_bytecode_output output;
2479 struct r600_bytecode_cf *cf_jump, *cf_pop,
2480 *last_exp_pos = NULL, *last_exp_param = NULL;
2481 int next_clip_pos = 61, next_param = 0;
2482 unsigned i, j;
2483 int ring;
2484 bool only_ring_0 = true;
2485 cshader = calloc(1, sizeof(struct r600_pipe_shader));
2486 if (!cshader)
2487 return 0;
2488
2489 memcpy(cshader->shader.output, gs_shader->output, ocnt *
2490 sizeof(struct r600_shader_io));
2491
2492 cshader->shader.noutput = ocnt;
2493
2494 ctx.shader = &cshader->shader;
2495 ctx.bc = &ctx.shader->bc;
2496 ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;
2497
2498 r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
2499 rctx->screen->has_compressed_msaa_texturing);
2500
2501 ctx.bc->isa = rctx->isa;
2502
2503 cf_jump = NULL;
2504 memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));
2505
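/* R0.x arrives with the GSVS ring offset in its low 30 bits and the
 * stream ID in bits 31:30. The two ALU ops below share one instruction
 * group (no 'last' on the first), so the shift still reads the
 * original, unmasked R0.x. */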
2506 /* R0.x = R0.x & 0x3fffffff */
2507 memset(&alu, 0, sizeof(alu));
2508 alu.op = ALU_OP2_AND_INT;
2509 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2510 alu.src[1].value = 0x3fffffff;
2511 alu.dst.write = 1;
2512 r600_bytecode_add_alu(ctx.bc, &alu);
2513
2514 /* R0.y = R0.x >> 30 */
2515 memset(&alu, 0, sizeof(alu));
2516 alu.op = ALU_OP2_LSHR_INT;
2517 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2518 alu.src[1].value = 0x1e;
2519 alu.dst.chan = 1;
2520 alu.dst.write = 1;
2521 alu.last = 1;
2522 r600_bytecode_add_alu(ctx.bc, &alu);
2523
2524 /* fetch vertex data from GSVS ring */
2525 for (i = 0; i < ocnt; ++i) {
2526 struct r600_shader_io *out = &ctx.shader->output[i];
2527
2528 out->gpr = i + 1;
2529 out->ring_offset = i * 16;
2530
2531 memset(&vtx, 0, sizeof(vtx));
2532 vtx.op = FETCH_OP_VFETCH;
2533 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
2534 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2535 vtx.mega_fetch_count = 16;
2536 vtx.offset = out->ring_offset;
2537 vtx.dst_gpr = out->gpr;
2538 vtx.src_gpr = 0;
2539 vtx.dst_sel_x = 0;
2540 vtx.dst_sel_y = 1;
2541 vtx.dst_sel_z = 2;
2542 vtx.dst_sel_w = 3;
2543 if (rctx->b.chip_class >= EVERGREEN) {
2544 vtx.use_const_fields = 1;
2545 } else {
2546 vtx.data_format = FMT_32_32_32_32_FLOAT;
2547 }
2548
2549 r600_bytecode_add_vtx(ctx.bc, &vtx);
2550 }
2551 ctx.temp_reg = i + 1;
2552 for (ring = 3; ring >= 0; --ring) {
2553 bool enabled = false;
2554 for (i = 0; i < so->num_outputs; i++) {
2555 if (so->output[i].stream == ring) {
2556 enabled = true;
2557 if (ring > 0)
2558 only_ring_0 = false;
2559 break;
2560 }
2561 }
2562 if (ring != 0 && !enabled) {
2563 cshader->shader.ring_item_sizes[ring] = 0;
2564 continue;
2565 }
2566
2567 if (cf_jump) {
2568 // Patch up jump label
2569 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2570 cf_pop = ctx.bc->cf_last;
2571
2572 cf_jump->cf_addr = cf_pop->id + 2;
2573 cf_jump->pop_count = 1;
2574 cf_pop->cf_addr = cf_pop->id + 2;
2575 cf_pop->pop_count = 1;
2576 }
2577
2578 /* PRED_SETE_INT __, R0.y, ring */
2579 memset(&alu, 0, sizeof(alu));
2580 alu.op = ALU_OP2_PRED_SETE_INT;
2581 alu.src[0].chan = 1;
2582 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2583 alu.src[1].value = ring;
2584 alu.execute_mask = 1;
2585 alu.update_pred = 1;
2586 alu.last = 1;
2587 r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
2588
2589 r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
2590 cf_jump = ctx.bc->cf_last;
2591
2592 if (enabled)
2593 emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
2594 cshader->shader.ring_item_sizes[ring] = ocnt * 16;
2595 }
2596
2597 /* the bytecode builder pads with NOPs on R600 - do the same here */
2598 if (ctx.bc->chip_class == R600) {
2599 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2600 alu.op = ALU_OP0_NOP;
2601 alu.last = 1;
2602 r600_bytecode_add_alu(ctx.bc, &alu);
2603
2604 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2605 }
2606
2607 /* export vertex data */
2608 /* XXX factor out common code with r600_shader_from_tgsi ? */
2609 for (i = 0; i < ocnt; ++i) {
2610 struct r600_shader_io *out = &ctx.shader->output[i];
2611 bool instream0 = true;
2612 if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
2613 continue;
2614
2615 for (j = 0; j < so->num_outputs; j++) {
2616 if (so->output[j].register_index == i) {
2617 if (so->output[j].stream == 0)
2618 break;
2619 if (so->output[j].stream > 0)
2620 instream0 = false;
2621 }
2622 }
2623 if (!instream0)
2624 continue;
2625 memset(&output, 0, sizeof(output));
2626 output.gpr = out->gpr;
2627 output.elem_size = 3;
2628 output.swizzle_x = 0;
2629 output.swizzle_y = 1;
2630 output.swizzle_z = 2;
2631 output.swizzle_w = 3;
2632 output.burst_count = 1;
2633 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2634 output.op = CF_OP_EXPORT;
2635 switch (out->name) {
2636 case TGSI_SEMANTIC_POSITION:
2637 output.array_base = 60;
2638 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2639 break;
2640
2641 case TGSI_SEMANTIC_PSIZE:
2642 output.array_base = 61;
2643 if (next_clip_pos == 61)
2644 next_clip_pos = 62;
2645 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2646 output.swizzle_y = 7;
2647 output.swizzle_z = 7;
2648 output.swizzle_w = 7;
2649 ctx.shader->vs_out_misc_write = 1;
2650 ctx.shader->vs_out_point_size = 1;
2651 break;
2652 case TGSI_SEMANTIC_LAYER:
2653 if (out->spi_sid) {
2654 /* duplicate it as PARAM to pass to the pixel shader */
2655 output.array_base = next_param++;
2656 r600_bytecode_add_output(ctx.bc, &output);
2657 last_exp_param = ctx.bc->cf_last;
2658 }
2659 output.array_base = 61;
2660 if (next_clip_pos == 61)
2661 next_clip_pos = 62;
2662 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2663 output.swizzle_x = 7;
2664 output.swizzle_y = 7;
2665 output.swizzle_z = 0;
2666 output.swizzle_w = 7;
2667 ctx.shader->vs_out_misc_write = 1;
2668 ctx.shader->vs_out_layer = 1;
2669 break;
2670 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2671 if (out->spi_sid) {
2672 /* duplicate it as PARAM to pass to the pixel shader */
2673 output.array_base = next_param++;
2674 r600_bytecode_add_output(ctx.bc, &output);
2675 last_exp_param = ctx.bc->cf_last;
2676 }
2677 output.array_base = 61;
2678 if (next_clip_pos == 61)
2679 next_clip_pos = 62;
2680 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2681 ctx.shader->vs_out_misc_write = 1;
2682 ctx.shader->vs_out_viewport = 1;
2683 output.swizzle_x = 7;
2684 output.swizzle_y = 7;
2685 output.swizzle_z = 7;
2686 output.swizzle_w = 0;
2687 break;
2688 case TGSI_SEMANTIC_CLIPDIST:
2689 /* spi_sid is 0 for clipdistance outputs that were generated
2690 * for clipvertex - we don't need to pass them to PS */
2691 ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
2692 ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
2693 ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
2694 if (out->spi_sid) {
2695 /* duplicate it as PARAM to pass to the pixel shader */
2696 output.array_base = next_param++;
2697 r600_bytecode_add_output(ctx.bc, &output);
2698 last_exp_param = ctx.bc->cf_last;
2699 }
2700 output.array_base = next_clip_pos++;
2701 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2702 break;
2703 case TGSI_SEMANTIC_FOG:
2704 output.swizzle_y = 4; /* 0 */
2705 output.swizzle_z = 4; /* 0 */
2706 output.swizzle_w = 5; /* 1 */
2707 break;
2708 default:
2709 output.array_base = next_param++;
2710 break;
2711 }
2712 r600_bytecode_add_output(ctx.bc, &output);
2713 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
2714 last_exp_param = ctx.bc->cf_last;
2715 else
2716 last_exp_pos = ctx.bc->cf_last;
2717 }
2718
2719 if (!last_exp_pos) {
2720 memset(&output, 0, sizeof(output));
2721 output.gpr = 0;
2722 output.elem_size = 3;
2723 output.swizzle_x = 7;
2724 output.swizzle_y = 7;
2725 output.swizzle_z = 7;
2726 output.swizzle_w = 7;
2727 output.burst_count = 1;
2728 output.type = 2;
2729 output.op = CF_OP_EXPORT;
2730 output.array_base = 60;
2731 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2732 r600_bytecode_add_output(ctx.bc, &output);
2733 last_exp_pos = ctx.bc->cf_last;
2734 }
2735
2736 if (!last_exp_param) {
2737 memset(&output, 0, sizeof(output));
2738 output.gpr = 0;
2739 output.elem_size = 3;
2740 output.swizzle_x = 7;
2741 output.swizzle_y = 7;
2742 output.swizzle_z = 7;
2743 output.swizzle_w = 7;
2744 output.burst_count = 1;
2745 output.type = 2;
2746 output.op = CF_OP_EXPORT;
2747 output.array_base = next_param++;
2748 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2749 r600_bytecode_add_output(ctx.bc, &output);
2750 last_exp_param = ctx.bc->cf_last;
2751 }
2752
2753 last_exp_pos->op = CF_OP_EXPORT_DONE;
2754 last_exp_param->op = CF_OP_EXPORT_DONE;
2755
2756 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2757 cf_pop = ctx.bc->cf_last;
2758
2759 cf_jump->cf_addr = cf_pop->id + 2;
2760 cf_jump->pop_count = 1;
2761 cf_pop->cf_addr = cf_pop->id + 2;
2762 cf_pop->pop_count = 1;
2763
2764 if (ctx.bc->chip_class == CAYMAN)
2765 cm_bytecode_add_cf_end(ctx.bc);
2766 else {
2767 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2768 ctx.bc->cf_last->end_of_program = 1;
2769 }
2770
2771 gs->gs_copy_shader = cshader;
2772 cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
2773
2774 ctx.bc->nstack = 1;
2775
2776 return r600_bytecode_build(ctx.bc);
2777 }
2778
2779 static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2780 {
2781 if (ind) {
2782 struct r600_bytecode_alu alu;
2783 int r;
2784
2785 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2786 alu.op = ALU_OP2_ADD_INT;
2787 alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2788 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2789 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2790 alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2791 alu.dst.write = 1;
2792 alu.last = 1;
2793 r = r600_bytecode_add_alu(ctx->bc, &alu);
2794 if (r)
2795 return r;
2796 }
2797 return 0;
2798 }
2799
2800 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
2801 {
2802 struct r600_bytecode_output output;
2803 int ring_offset;
2804 unsigned i, k;
2805 int effective_stream = stream == -1 ? 0 : stream;
2806 int idx = 0;
2807
2808 for (i = 0; i < ctx->shader->noutput; i++) {
2809 if (ctx->gs_for_vs) {
2810 /* for ES we need to look up the corresponding ring offset expected by
2811 * the GS (map this output to a GS input by name and sid) */
2812 /* FIXME precompute offsets */
2813 ring_offset = -1;
2814 for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
2815 struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
2816 struct r600_shader_io *out = &ctx->shader->output[i];
2817 if (in->name == out->name && in->sid == out->sid)
2818 ring_offset = in->ring_offset;
2819 }
2820
2821 if (ring_offset == -1)
2822 continue;
2823 } else {
2824 ring_offset = idx * 16;
2825 idx++;
2826 }
2827
2828 if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
2829 continue;
2830 /* next_ring_offset after parsing input decls contains total size of
2831 * single vertex data, gs_next_vertex - current vertex index */
2832 if (!ind)
2833 ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
2834
2835 memset(&output, 0, sizeof(struct r600_bytecode_output));
2836 output.gpr = ctx->shader->output[i].gpr;
2837 output.elem_size = 3;
2838 output.comp_mask = 0xF;
2839 output.burst_count = 1;
2840
2841 if (ind)
2842 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
2843 else
2844 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2845
2846 switch (stream) {
2847 default:
2848 case 0:
2849 output.op = CF_OP_MEM_RING; break;
2850 case 1:
2851 output.op = CF_OP_MEM_RING1; break;
2852 case 2:
2853 output.op = CF_OP_MEM_RING2; break;
2854 case 3:
2855 output.op = CF_OP_MEM_RING3; break;
2856 }
2857
2858 if (ind) {
2859 output.array_base = ring_offset >> 2; /* in dwords */
2860 output.array_size = 0xfff;
2861 output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
2862 } else
2863 output.array_base = ring_offset >> 2; /* in dwords */
2864 r600_bytecode_add_output(ctx->bc, &output);
2865 }
2866
2867 ++ctx->gs_next_vertex;
2868 return 0;
2869 }
2870
2871
2872 static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2873 {
2874 int r;
2875 struct r600_bytecode_vtx vtx;
2876 int temp_val = ctx->temp_reg;
2877 /* zero temp_val.x - the VFETCHes below use it as their fetch index */
2878 r = single_alu_op2(ctx, ALU_OP1_MOV,
2879 temp_val, 0,
2880 V_SQ_ALU_SRC_LITERAL, 0,
2881 0, 0);
2882 if (r)
2883 return r;
2884
2885 /* used by VS/TCS */
2886 if (ctx->tess_input_info) {
2887 /* fetch tcs input values into resv space */
2888 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2889 vtx.op = FETCH_OP_VFETCH;
2890 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2891 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2892 vtx.mega_fetch_count = 16;
2893 vtx.data_format = FMT_32_32_32_32;
2894 vtx.num_format_all = 2;
2895 vtx.format_comp_all = 1;
2896 vtx.use_const_fields = 0;
2897 vtx.endian = r600_endian_swap(32);
2898 vtx.srf_mode_all = 1;
2899 vtx.offset = 0;
2900 vtx.dst_gpr = ctx->tess_input_info;
2901 vtx.dst_sel_x = 0;
2902 vtx.dst_sel_y = 1;
2903 vtx.dst_sel_z = 2;
2904 vtx.dst_sel_w = 3;
2905 vtx.src_gpr = temp_val;
2906 vtx.src_sel_x = 0;
2907
2908 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2909 if (r)
2910 return r;
2911 }
2912
2913 /* used by TCS/TES */
2914 if (ctx->tess_output_info) {
2915 /* fetch tcs output values into resv space */
2916 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2917 vtx.op = FETCH_OP_VFETCH;
2918 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2919 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2920 vtx.mega_fetch_count = 16;
2921 vtx.data_format = FMT_32_32_32_32;
2922 vtx.num_format_all = 2;
2923 vtx.format_comp_all = 1;
2924 vtx.use_const_fields = 0;
2925 vtx.endian = r600_endian_swap(32);
2926 vtx.srf_mode_all = 1;
2927 vtx.offset = 16;
2928 vtx.dst_gpr = ctx->tess_output_info;
2929 vtx.dst_sel_x = 0;
2930 vtx.dst_sel_y = 1;
2931 vtx.dst_sel_z = 2;
2932 vtx.dst_sel_w = 3;
2933 vtx.src_gpr = temp_val;
2934 vtx.src_sel_x = 0;
2935
2936 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2937 if (r)
2938 return r;
2939 }
2940 return 0;
2941 }
2942
2943 static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
2944 {
2945 int j, r;
2946 int temp_reg;
2947 unsigned i;
2948
2949 /* fetch tcs input values into input_vals */
2950 ctx->tess_input_info = r600_get_temp(ctx);
2951 ctx->tess_output_info = 0;
2952 r = r600_fetch_tess_io_info(ctx);
2953 if (r)
2954 return r;
2955
2956 temp_reg = r600_get_temp(ctx);
2957 /* temp_reg.x will contain the LDS address for this vertex: */
2958 /* MUL vertexID (r0.y), vertex_dw_stride (tess_input_info.y) */
2959 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
2960 temp_reg, 0,
2961 ctx->tess_input_info, 1,
2962 0, 1); /* rel id in r0.y? */
2963 if (r)
2964 return r;
2965
2966 for (i = 0; i < ctx->shader->noutput; i++) {
2967 struct r600_bytecode_alu alu;
2968 int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid);
2969
2970 if (param) {
2971 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2972 temp_reg, 1,
2973 temp_reg, 0,
2974 V_SQ_ALU_SRC_LITERAL, param * 16);
2975 if (r)
2976 return r;
2977 }
2978
2979 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2980 temp_reg, 2,
2981 temp_reg, param ? 1 : 0,
2982 V_SQ_ALU_SRC_LITERAL, 8);
2983 if (r)
2984 return r;
2985
2986
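/* each LDS_WRITE_REL stores two dwords: src[1] at the address in
 * src[0] and src[2] one dword after it (lds_idx = 1), so the vec4
 * output goes out as .xy at the base address and .zw at base + 8 */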
2987 for (j = 0; j < 2; j++) {
2988 int chan = (j == 1) ? 2 : (param ? 1 : 0);
2989 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2990 alu.op = LDS_OP3_LDS_WRITE_REL;
2991 alu.src[0].sel = temp_reg;
2992 alu.src[0].chan = chan;
2993 alu.src[1].sel = ctx->shader->output[i].gpr;
2994 alu.src[1].chan = j * 2;
2995 alu.src[2].sel = ctx->shader->output[i].gpr;
2996 alu.src[2].chan = (j * 2) + 1;
2997 alu.last = 1;
2998 alu.dst.chan = 0;
2999 alu.lds_idx = 1;
3000 alu.is_lds_idx_op = true;
3001 r = r600_bytecode_add_alu(ctx->bc, &alu);
3002 if (r)
3003 return r;
3004 }
3005 }
3006 return 0;
3007 }
3008
3009 static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
3010 {
3011 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3012 const struct tgsi_full_dst_register *dst = &inst->Dst[0];
3013 int i, r, lasti;
3014 int temp_reg = r600_get_temp(ctx);
3015 struct r600_bytecode_alu alu;
3016 unsigned write_mask = dst->Register.WriteMask;
3017
3018 if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
3019 return 0;
3020
3021 r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
3022 if (r)
3023 return r;
3024
3025 /* the base address is now in temp.x */
3026 r = r600_get_byte_address(ctx, temp_reg,
3027 &inst->Dst[0], NULL, ctx->tess_output_info, 1);
3028 if (r)
3029 return r;
3030
3031 /* LDS write */
3032 lasti = tgsi_last_instruction(write_mask);
3033 for (i = 1; i <= lasti; i++) {
3034
3035 if (!(write_mask & (1 << i)))
3036 continue;
3037 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3038 temp_reg, i,
3039 temp_reg, 0,
3040 V_SQ_ALU_SRC_LITERAL, 4 * i);
3041 if (r)
3042 return r;
3043 }
3044
3045 for (i = 0; i <= lasti; i++) {
3046 if (!(write_mask & (1 << i)))
3047 continue;
3048
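/* when both halves of an .xy or .zw pair are written, a single
 * LDS_WRITE_REL below stores the two consecutive dwords at once and
 * the second channel is skipped */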
3049 if ((i == 0 && ((write_mask & 3) == 3)) ||
3050 (i == 2 && ((write_mask & 0xc) == 0xc))) {
3051 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3052 alu.op = LDS_OP3_LDS_WRITE_REL;
3053 alu.src[0].sel = temp_reg;
3054 alu.src[0].chan = i;
3055
3056 alu.src[1].sel = dst->Register.Index;
3057 alu.src[1].sel += ctx->file_offset[dst->Register.File];
3058 alu.src[1].chan = i;
3059
3060 alu.src[2].sel = dst->Register.Index;
3061 alu.src[2].sel += ctx->file_offset[dst->Register.File];
3062 alu.src[2].chan = i + 1;
3063 alu.lds_idx = 1;
3064 alu.dst.chan = 0;
3065 alu.last = 1;
3066 alu.is_lds_idx_op = true;
3067 r = r600_bytecode_add_alu(ctx->bc, &alu);
3068 if (r)
3069 return r;
3070 i += 1;
3071 continue;
3072 }
3073 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3074 alu.op = LDS_OP2_LDS_WRITE;
3075 alu.src[0].sel = temp_reg;
3076 alu.src[0].chan = i;
3077
3078 alu.src[1].sel = dst->Register.Index;
3079 alu.src[1].sel += ctx->file_offset[dst->Register.File];
3080 alu.src[1].chan = i;
3081
3082 alu.src[2].sel = V_SQ_ALU_SRC_0;
3083 alu.dst.chan = 0;
3084 alu.last = 1;
3085 alu.is_lds_idx_op = true;
3086 r = r600_bytecode_add_alu(ctx->bc, &alu);
3087 if (r)
3088 return r;
3089 }
3090 return 0;
3091 }
3092
3093 static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
3094 int output_idx, int nc)
3095 {
3096 int param;
3097 unsigned temp_reg = r600_get_temp(ctx);
3098 unsigned name = ctx->shader->output[output_idx].name;
3099 int dreg = ctx->shader->output[output_idx].gpr;
3100 int r;
3101
3102 param = r600_get_lds_unique_index(name, 0);
3103 r = get_lds_offset0(ctx, 1, temp_reg, true);
3104 if (r)
3105 return r;
3106
3107 if (param) {
3108 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3109 temp_reg, 0,
3110 temp_reg, 0,
3111 V_SQ_ALU_SRC_LITERAL, param * 16);
3112 if (r)
3113 return r;
3114 }
3115
3116 do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
3117 return 0;
3118 }
3119
3120 static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
3121 {
3122 int stride, outer_comps, inner_comps;
3123 int tessinner_idx = -1, tessouter_idx = -1;
3124 int i, r;
3125 unsigned j;
3126 int temp_reg = r600_get_temp(ctx);
3127 int treg[3] = {-1, -1, -1};
3128 struct r600_bytecode_alu alu;
3129 struct r600_bytecode_cf *cf_jump, *cf_pop;
3130
3131 /* only execute factor emission for invocation 0 */
3132 /* PRED_SETE_INT __, R0.z, 0 */
3133 memset(&alu, 0, sizeof(alu));
3134 alu.op = ALU_OP2_PRED_SETE_INT;
3135 alu.src[0].chan = 2;
3136 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3137 alu.execute_mask = 1;
3138 alu.update_pred = 1;
3139 alu.last = 1;
3140 r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);
3141
3142 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
3143 cf_jump = ctx->bc->cf_last;
3144
3145 treg[0] = r600_get_temp(ctx);
3146 switch (ctx->shader->tcs_prim_mode) {
3147 case PIPE_PRIM_LINES:
3148 stride = 8; /* 2 dwords, 1 vec2 store */
3149 outer_comps = 2;
3150 inner_comps = 0;
3151 break;
3152 case PIPE_PRIM_TRIANGLES:
3153 stride = 16; /* 4 dwords, 1 vec4 store */
3154 outer_comps = 3;
3155 inner_comps = 1;
3156 treg[1] = r600_get_temp(ctx);
3157 break;
3158 case PIPE_PRIM_QUADS:
3159 stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
3160 outer_comps = 4;
3161 inner_comps = 2;
3162 treg[1] = r600_get_temp(ctx);
3163 treg[2] = r600_get_temp(ctx);
3164 break;
3165 default:
3166 assert(0);
3167 return -1;
3168 }
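/* e.g. for triangles each patch gets a 16-byte tess factor slot: the
 * three outer factors land at byte offsets 0, 4 and 8 and the single
 * inner factor at offset 12 (the 4 * i term below) */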
3169
3170 /* R0 is PatchID, RelPatchID, InvocationID, tf_base */
3171 /* TF_WRITE takes index in R.x, value in R.y */
3172 for (j = 0; j < ctx->shader->noutput; j++) {
3173 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
3174 tessinner_idx = j;
3175 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
3176 tessouter_idx = j;
3177 }
3178
3179 if (tessouter_idx == -1)
3180 return -1;
3181
3182 if (tessinner_idx == -1 && inner_comps)
3183 return -1;
3184
3185 if (tessouter_idx != -1) {
3186 r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
3187 if (r)
3188 return r;
3189 }
3190
3191 if (tessinner_idx != -1) {
3192 r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
3193 if (r)
3194 return r;
3195 }
3196
3197 /* t.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride, */
3198 /* computed with a single MULADD below: */
3199 /* multiply incoming r0.y by the stride, */
3200 /* then add incoming r0.w to the result */
3201
3202 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3203 temp_reg, 0,
3204 0, 1,
3205 V_SQ_ALU_SRC_LITERAL, stride,
3206 0, 3);
3207 if (r)
3208 return r;
3209
3210 for (i = 0; i < outer_comps + inner_comps; i++) {
3211 int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
3212 int out_comp = i >= outer_comps ? i - outer_comps : i;
3213
3214 if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
3215 if (out_comp == 1)
3216 out_comp = 0;
3217 else if (out_comp == 0)
3218 out_comp = 1;
3219 }
3220
3221 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
3222 treg[i / 2], (2 * (i % 2)),
3223 temp_reg, 0,
3224 V_SQ_ALU_SRC_LITERAL, 4 * i);
3225 if (r)
3226 return r;
3227 r = single_alu_op2(ctx, ALU_OP1_MOV,
3228 treg[i / 2], 1 + (2 * (i%2)),
3229 ctx->shader->output[out_idx].gpr, out_comp,
3230 0, 0);
3231 if (r)
3232 return r;
3233 }
3234 for (i = 0; i < outer_comps + inner_comps; i++) {
3235 struct r600_bytecode_gds gds;
3236
3237 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
3238 gds.src_gpr = treg[i / 2];
3239 gds.src_sel_x = 2 * (i % 2);
3240 gds.src_sel_y = 1 + (2 * (i % 2));
3241 gds.src_sel_z = 4;
3242 gds.dst_sel_x = 7;
3243 gds.dst_sel_y = 7;
3244 gds.dst_sel_z = 7;
3245 gds.dst_sel_w = 7;
3246 gds.op = FETCH_OP_TF_WRITE;
3247 r = r600_bytecode_add_gds(ctx->bc, &gds);
3248 if (r)
3249 return r;
3250 }
3251
3252 // Patch up jump label
3253 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
3254 cf_pop = ctx->bc->cf_last;
3255
3256 cf_jump->cf_addr = cf_pop->id + 2;
3257 cf_jump->pop_count = 1;
3258 cf_pop->cf_addr = cf_pop->id + 2;
3259 cf_pop->pop_count = 1;
3260
3261 return 0;
3262 }
3263
3264 /*
3265 * We have to work out the thread ID for load and atomic
3266 * operations, which store the returned value to an index
3267 * in an intermediate buffer.
3268 * The index is built from the lane ID (computed with the
3269 * MBCNT instructions), the hardware wave ID and the shader
3270 * engine (SE) ID:
3271 * the SE ID is multiplied by 256, the wave ID is added, the
3272 * result is multiplied by 64 and finally the lane ID is
3273 * added.
3274 */
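/* A minimal sketch (not driver code) of the index computed below; se_id,
 * hw_wave_id and lane stand in for the SE_ID and HW_WAVE_ID hardware
 * registers and the MBCNT result. */
#if 0
static unsigned sketch_thread_index(unsigned se_id, unsigned hw_wave_id,
                                    unsigned lane)
{
	return (se_id * 256 + hw_wave_id) * 64 + lane;
}
#endif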
3275 static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
3276 {
3277 struct r600_bytecode_alu alu;
3278 int r;
3279
3280 if (ctx->thread_id_gpr_loaded)
3281 return 0;
3282
3283 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3284 alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
3285 alu.dst.sel = ctx->temp_reg;
3286 alu.dst.chan = 0;
3287 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3288 alu.src[0].value = 0xffffffff;
3289 alu.dst.write = 1;
3290 r = r600_bytecode_add_alu(ctx->bc, &alu);
3291 if (r)
3292 return r;
3293
3294 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3295 alu.op = ALU_OP1_MBCNT_32HI_INT;
3296 alu.dst.sel = ctx->temp_reg;
3297 alu.dst.chan = 1;
3298 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3299 alu.src[0].value = 0xffffffff;
3300 alu.dst.write = 1;
3301 r = r600_bytecode_add_alu(ctx->bc, &alu);
3302 if (r)
3303 return r;
3304
3305 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3306 alu.op = ALU_OP3_MULADD_UINT24;
3307 alu.dst.sel = ctx->temp_reg;
3308 alu.dst.chan = 2;
3309 alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
3310 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3311 alu.src[1].value = 256;
3312 alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
3313 alu.dst.write = 1;
3314 alu.is_op3 = 1;
3315 alu.last = 1;
3316 r = r600_bytecode_add_alu(ctx->bc, &alu);
3317 if (r)
3318 return r;
3319
3320 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3321 ctx->thread_id_gpr, 1,
3322 ctx->temp_reg, 2,
3323 V_SQ_ALU_SRC_LITERAL, 0x40,
3324 ctx->temp_reg, 0);
3325 if (r)
3326 return r;
3327 ctx->thread_id_gpr_loaded = true;
3328 return 0;
3329 }
3330
3331 static int r600_shader_from_tgsi(struct r600_context *rctx,
3332 struct r600_pipe_shader *pipeshader,
3333 union r600_shader_key key)
3334 {
3335 struct r600_screen *rscreen = rctx->screen;
3336 struct r600_shader *shader = &pipeshader->shader;
3337 struct tgsi_token *tokens = pipeshader->selector->tokens;
3338 struct pipe_stream_output_info so = pipeshader->selector->so;
3339 struct tgsi_full_immediate *immediate;
3340 struct r600_shader_ctx ctx;
3341 struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
3342 unsigned output_done, noutput;
3343 unsigned opcode;
3344 int j, k, r = 0;
3345 unsigned i;
3346 int next_param_base = 0, next_clip_base;
3347 int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
3348 bool indirect_gprs;
3349 bool ring_outputs = false;
3350 bool lds_outputs = false;
3351 bool lds_inputs = false;
3352 bool pos_emitted = false;
3353
3354 ctx.bc = &shader->bc;
3355 ctx.shader = shader;
3356
3357 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
3358 rscreen->has_compressed_msaa_texturing);
3359 ctx.tokens = tokens;
3360 tgsi_scan_shader(tokens, &ctx.info);
3361 shader->indirect_files = ctx.info.indirect_files;
3362
3363 int narrays = ctx.info.array_max[TGSI_FILE_TEMPORARY];
3364 ctx.array_infos = calloc(narrays, sizeof(*ctx.array_infos));
3365 ctx.spilled_arrays = calloc(narrays, sizeof(bool));
3366 tgsi_scan_arrays(tokens, TGSI_FILE_TEMPORARY, narrays, ctx.array_infos);
3367
3368 shader->uses_helper_invocation = false;
3369 shader->uses_doubles = ctx.info.uses_doubles;
3370 shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
3371 shader->nsys_inputs = 0;
3372
3373 shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 ||
3374 ctx.info.file_count[TGSI_FILE_BUFFER] > 0;
3375 indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
3376 tgsi_parse_init(&ctx.parse, tokens);
3377 ctx.type = ctx.info.processor;
3378 shader->processor_type = ctx.type;
3379 ctx.bc->type = shader->processor_type;
3380
3381 switch (ctx.type) {
3382 case PIPE_SHADER_VERTEX:
3383 shader->vs_as_gs_a = key.vs.as_gs_a;
3384 shader->vs_as_es = key.vs.as_es;
3385 shader->vs_as_ls = key.vs.as_ls;
3386 shader->atomic_base = key.vs.first_atomic_counter;
3387 if (shader->vs_as_es)
3388 ring_outputs = true;
3389 if (shader->vs_as_ls)
3390 lds_outputs = true;
3391 break;
3392 case PIPE_SHADER_GEOMETRY:
3393 ring_outputs = true;
3394 shader->atomic_base = key.gs.first_atomic_counter;
3395 shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;
3396 break;
3397 case PIPE_SHADER_TESS_CTRL:
3398 shader->tcs_prim_mode = key.tcs.prim_mode;
3399 shader->atomic_base = key.tcs.first_atomic_counter;
3400 lds_outputs = true;
3401 lds_inputs = true;
3402 break;
3403 case PIPE_SHADER_TESS_EVAL:
3404 shader->tes_as_es = key.tes.as_es;
3405 shader->atomic_base = key.tes.first_atomic_counter;
3406 lds_inputs = true;
3407 if (shader->tes_as_es)
3408 ring_outputs = true;
3409 break;
3410 case PIPE_SHADER_FRAGMENT:
3411 shader->two_side = key.ps.color_two_side;
3412 shader->atomic_base = key.ps.first_atomic_counter;
3413 shader->rat_base = key.ps.nr_cbufs;
3414 shader->image_size_const_offset = key.ps.image_size_const_offset;
3415 break;
3416 case PIPE_SHADER_COMPUTE:
3417 shader->rat_base = 0;
3418 shader->image_size_const_offset = ctx.info.file_count[TGSI_FILE_SAMPLER];
3419 break;
3420 default:
3421 break;
3422 }
3423
3424 if (shader->vs_as_es || shader->tes_as_es) {
3425 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
3426 } else {
3427 ctx.gs_for_vs = NULL;
3428 }
3429
3430 ctx.next_ring_offset = 0;
3431 ctx.gs_out_ring_offset = 0;
3432 ctx.gs_next_vertex = 0;
3433 ctx.gs_stream_output_info = &so;
3434
3435 ctx.face_gpr = -1;
3436 ctx.fixed_pt_position_gpr = -1;
3437 ctx.fragcoord_input = -1;
3438 ctx.colors_used = 0;
3439 ctx.clip_vertex_write = 0;
3440 ctx.thread_id_gpr_loaded = false;
3441
3442 ctx.helper_invoc_reg = -1;
3443 ctx.cs_block_size_reg = -1;
3444 ctx.cs_grid_size_reg = -1;
3445 ctx.cs_block_size_loaded = false;
3446 ctx.cs_grid_size_loaded = false;
3447
3448 shader->nr_ps_color_exports = 0;
3449 shader->nr_ps_max_color_exports = 0;
3450
3451
3452 /* register allocations */
3453 /* Values [0,127] correspond to GPR[0..127].
3454 * Values [128,159] correspond to constant buffer bank 0
3455 * Values [160,191] correspond to constant buffer bank 1
3456 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
3457 * Values [256,287] correspond to constant buffer bank 2 (EG)
3458 * Values [288,319] correspond to constant buffer bank 3 (EG)
3459 * Other special values are shown in the list below.
3460 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
3461 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
3462 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
3463 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
3464 * 248 SQ_ALU_SRC_0: special constant 0.0.
3465 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
3466 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
3467 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
3468 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
3469 * 253 SQ_ALU_SRC_LITERAL: literal constant.
3470 * 254 SQ_ALU_SRC_PV: previous vector result.
3471 * 255 SQ_ALU_SRC_PS: previous scalar result.
3472 */
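/* For example, a TGSI CONST[1][5] source ends up with sel = 512 + 5 and
 * kc_bank = 1; tgsi_fetch_rel_const() recovers the element index for
 * relative fetches as sel - 512. */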
3473 for (i = 0; i < TGSI_FILE_COUNT; i++) {
3474 ctx.file_offset[i] = 0;
3475 }
3476
3477 if (ctx.type == PIPE_SHADER_VERTEX) {
3478
3479 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3480 if (ctx.info.num_inputs)
3481 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3482 }
3483 if (ctx.type == PIPE_SHADER_FRAGMENT) {
3484 if (ctx.bc->chip_class >= EVERGREEN)
3485 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3486 else
3487 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3488
3489 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3490 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_HELPER_INVOCATION) {
3491 ctx.helper_invoc_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3492 shader->uses_helper_invocation = true;
3493 }
3494 }
3495 }
3496 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3497 /* FIXME 1 would be enough in some cases (3 or fewer input vertices) */
3498 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3499 }
3500 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3501 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3502 if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3503 bool add_tesscoord = false, add_tess_inout = false;
3504 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3505 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3506 /* if we have tesscoord save one reg */
3507 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3508 add_tesscoord = true;
3509 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3510 ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3511 add_tess_inout = true;
3512 }
3513 if (add_tesscoord || add_tess_inout)
3514 ctx.file_offset[TGSI_FILE_INPUT]++;
3515 if (add_tess_inout)
3516 ctx.file_offset[TGSI_FILE_INPUT]+=2;
3517 }
3518 if (ctx.type == PIPE_SHADER_COMPUTE) {
3519 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3520 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3521 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE)
3522 ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3523 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE)
3524 ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3525 }
3526 }
3527
3528 ctx.file_offset[TGSI_FILE_OUTPUT] =
3529 ctx.file_offset[TGSI_FILE_INPUT] +
3530 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3531 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3532 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
3533
3534 /* Outside the GPR range. This will be translated to one of the
3535 * kcache banks later. */
3536 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3537 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3538
3539 pipeshader->scratch_space_needed = 0;
3540 int regno = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3541 ctx.info.file_max[TGSI_FILE_TEMPORARY];
3542 if (regno > 124) {
3543 choose_spill_arrays(&ctx, &regno, &pipeshader->scratch_space_needed);
3544 shader->indirect_files = ctx.info.indirect_files;
3545 }
3546 shader->needs_scratch_space = pipeshader->scratch_space_needed != 0;
3547
3548 ctx.bc->ar_reg = ++regno;
3549 ctx.bc->index_reg[0] = ++regno;
3550 ctx.bc->index_reg[1] = ++regno;
3551
3552 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3553 ctx.tess_input_info = ++regno;
3554 ctx.tess_output_info = ++regno;
3555 } else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3556 ctx.tess_input_info = 0;
3557 ctx.tess_output_info = ++regno;
3558 } else if (ctx.type == PIPE_SHADER_GEOMETRY) {
3559 ctx.gs_export_gpr_tregs[0] = ++regno;
3560 ctx.gs_export_gpr_tregs[1] = ++regno;
3561 ctx.gs_export_gpr_tregs[2] = ++regno;
3562 ctx.gs_export_gpr_tregs[3] = ++regno;
3563 if (ctx.shader->gs_tri_strip_adj_fix) {
3564 ctx.gs_rotated_input[0] = ++regno;
3565 ctx.gs_rotated_input[1] = ++regno;
3566 } else {
3567 ctx.gs_rotated_input[0] = 0;
3568 ctx.gs_rotated_input[1] = 1;
3569 }
3570 }
3571
3572 if (shader->uses_images) {
3573 ctx.thread_id_gpr = ++regno;
3574 ctx.thread_id_gpr_loaded = false;
3575 }
3576 ctx.temp_reg = ++regno;
3577
3578 shader->max_arrays = 0;
3579 shader->num_arrays = 0;
3580 if (indirect_gprs) {
3581
3582 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3583 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3584 ctx.file_offset[TGSI_FILE_OUTPUT] -
3585 ctx.file_offset[TGSI_FILE_INPUT],
3586 0x0F);
3587 }
3588 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3589 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3590 ctx.file_offset[TGSI_FILE_TEMPORARY] -
3591 ctx.file_offset[TGSI_FILE_OUTPUT],
3592 0x0F);
3593 }
3594 }
3595
3596 ctx.nliterals = 0;
3597 ctx.literals = NULL;
3598 ctx.max_driver_temp_used = 0;
3599
3600 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
3601 ctx.info.colors_written == 1;
3602 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3603 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3604
3605 if (ctx.type == PIPE_SHADER_VERTEX ||
3606 ctx.type == PIPE_SHADER_GEOMETRY ||
3607 ctx.type == PIPE_SHADER_TESS_EVAL) {
3608 shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +
3609 ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;
3610 shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;
3611 shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];
3612 }
3613
3614 if (shader->vs_as_gs_a)
3615 vs_add_primid_output(&ctx, key.vs.prim_id_out);
3616
3617 if (ctx.type == PIPE_SHADER_TESS_EVAL)
3618 r600_fetch_tess_io_info(&ctx);
3619
3620 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3621 tgsi_parse_token(&ctx.parse);
3622 switch (ctx.parse.FullToken.Token.Type) {
3623 case TGSI_TOKEN_TYPE_IMMEDIATE:
3624 immediate = &ctx.parse.FullToken.FullImmediate;
3625 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3626 if (ctx.literals == NULL) {
3627 r = -ENOMEM;
3628 goto out_err;
3629 }
3630 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3631 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3632 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3633 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3634 ctx.nliterals++;
3635 break;
3636 case TGSI_TOKEN_TYPE_DECLARATION:
3637 r = tgsi_declaration(&ctx);
3638 if (r)
3639 goto out_err;
3640 break;
3641 case TGSI_TOKEN_TYPE_INSTRUCTION:
3642 case TGSI_TOKEN_TYPE_PROPERTY:
3643 break;
3644 default:
3645 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3646 r = -EINVAL;
3647 goto out_err;
3648 }
3649 }
3650
3651 shader->ring_item_sizes[0] = ctx.next_ring_offset;
3652 shader->ring_item_sizes[1] = 0;
3653 shader->ring_item_sizes[2] = 0;
3654 shader->ring_item_sizes[3] = 0;
3655
3656 /* Process two-sided colors if needed */
3657 if (shader->two_side && ctx.colors_used) {
3658 int i, count = ctx.shader->ninput;
3659 unsigned next_lds_loc = ctx.shader->nlds;
3660
3661 /* Additional back-color inputs are allocated right after the existing
3662 * inputs. They are not needed after color selection, so we don't have
3663 * to reserve these gprs for the rest of the shader code or adjust
3664 * output offsets etc. */
3665 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3666 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3667
3668 /* if two-sided and neither face nor sample mask is used by the shader, ensure face_gpr is emitted */
3669 if (ctx.face_gpr == -1) {
3670 i = ctx.shader->ninput++;
3671 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3672 ctx.shader->input[i].spi_sid = 0;
3673 ctx.shader->input[i].gpr = gpr++;
3674 ctx.face_gpr = ctx.shader->input[i].gpr;
3675 }
3676
3677 for (i = 0; i < count; i++) {
3678 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3679 int ni = ctx.shader->ninput++;
3680 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3681 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3682 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3683 ctx.shader->input[ni].gpr = gpr++;
3684 // TGSI-to-LLVM needs to know the LDS position of inputs.
3685 // The non-LLVM path computes it later (in process_twoside_color).
3686 ctx.shader->input[ni].lds_pos = next_lds_loc++;
3687 ctx.shader->input[i].back_color_input = ni;
3688 if (ctx.bc->chip_class >= EVERGREEN) {
3689 if ((r = evergreen_interp_input(&ctx, ni)))
3690 return r;
3691 }
3692 }
3693 }
3694 }
3695
3696 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
3697 shader->nr_ps_max_color_exports = 8;
3698
3699 if (ctx.shader->uses_helper_invocation) {
3700 if (ctx.bc->chip_class == CAYMAN)
3701 r = cm_load_helper_invocation(&ctx);
3702 else
3703 r = eg_load_helper_invocation(&ctx);
3704 if (r)
3705 return r;
3706 }
3707
3708 /*
3709 * XXX this relies on fixed_pt_position_gpr only being present when
3710 * this shader should be executed per sample. Should be the case for now...
3711 */
3712 if (ctx.fixed_pt_position_gpr != -1 && ctx.info.reads_samplemask) {
3713 /*
3714 * Fix up the sample mask. The hw always gives us the coverage mask
3715 * for the whole pixel, but for per-sample shading we need the
3716 * coverage for the current shader invocation only.
3717 * Also, with msaa disabled, only the first bit should be set
3718 * (luckily the same fixup works for both problems).
3719 * For now, we can only do this when we know the shader is always
3720 * executed per sample (because bits used in the shader force
3721 * per-sample execution).
3722 * If the fb is not multisampled, we'd do unnecessary work, but the
3723 * result would still be correct.
3724 * It does nothing, however, for sample shading requested via
3725 * MinSampleShading.
3726 */
3727 struct r600_bytecode_alu alu;
3728 int tmp = r600_get_temp(&ctx);
3729 assert(ctx.face_gpr != -1);
3730 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3731
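/* tmp.x = 1 << sample_id (the sample id is in fixed_pt_position_gpr.w) */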
3732 alu.op = ALU_OP2_LSHL_INT;
3733 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3734 alu.src[0].value = 0x1;
3735 alu.src[1].sel = ctx.fixed_pt_position_gpr;
3736 alu.src[1].chan = 3;
3737 alu.dst.sel = tmp;
3738 alu.dst.chan = 0;
3739 alu.dst.write = 1;
3740 alu.last = 1;
3741 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3742 return r;
3743
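/* face_gpr.z = tmp.x & face_gpr.z, reducing the hw coverage mask to
 * the bit of this invocation's sample */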
3744 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3745 alu.op = ALU_OP2_AND_INT;
3746 alu.src[0].sel = tmp;
3747 alu.src[1].sel = ctx.face_gpr;
3748 alu.src[1].chan = 2;
3749 alu.dst.sel = ctx.face_gpr;
3750 alu.dst.chan = 2;
3751 alu.dst.write = 1;
3752 alu.last = 1;
3753 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3754 return r;
3755 }
3756
3757 if (ctx.fragcoord_input >= 0) {
3758 if (ctx.bc->chip_class == CAYMAN) {
3759 for (j = 0 ; j < 4; j++) {
3760 struct r600_bytecode_alu alu;
3761 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3762 alu.op = ALU_OP1_RECIP_IEEE;
3763 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3764 alu.src[0].chan = 3;
3765
3766 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3767 alu.dst.chan = j;
3768 alu.dst.write = (j == 3);
3769 alu.last = 1;
3770 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3771 return r;
3772 }
3773 } else {
3774 struct r600_bytecode_alu alu;
3775 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3776 alu.op = ALU_OP1_RECIP_IEEE;
3777 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3778 alu.src[0].chan = 3;
3779
3780 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3781 alu.dst.chan = 3;
3782 alu.dst.write = 1;
3783 alu.last = 1;
3784 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3785 return r;
3786 }
3787 }
3788
3789 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3790 struct r600_bytecode_alu alu;
3791 int r;
3792
3793 /* Workaround for GS threads with no output - emit a CUT at the start of the GS */
3794 if (ctx.bc->chip_class == R600)
3795 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3796
3797 for (j = 0; j < 4; j++) {
3798 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3799 alu.op = ALU_OP1_MOV;
3800 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3801 alu.src[0].value = 0;
3802 alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3803 alu.dst.write = 1;
3804 alu.last = 1;
3805 r = r600_bytecode_add_alu(ctx.bc, &alu);
3806 if (r)
3807 return r;
3808 }
3809
3810 if (ctx.shader->gs_tri_strip_adj_fix) {
3811 r = single_alu_op2(&ctx, ALU_OP2_AND_INT,
3812 ctx.gs_rotated_input[0], 2,
3813 0, 2,
3814 V_SQ_ALU_SRC_LITERAL, 1);
3815 if (r)
3816 return r;
3817
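/* Select between the straight and the rotated ((i + 4) % 6) vertex
 * offsets with CNDE_INT, based on the primitive parity computed above.
 * The six offsets appear to occupy channels 0/1/3 of the first offset
 * register and 0/1/2 of the second, hence the chan 2 -> 3 remapping. */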
3818 for (i = 0; i < 6; i++) {
3819 int rotated = (i + 4) % 6;
3820 int offset_reg = i / 3;
3821 int offset_chan = i % 3;
3822 int rotated_offset_reg = rotated / 3;
3823 int rotated_offset_chan = rotated % 3;
3824
3825 if (offset_reg == 0 && offset_chan == 2)
3826 offset_chan = 3;
3827 if (rotated_offset_reg == 0 && rotated_offset_chan == 2)
3828 rotated_offset_chan = 3;
3829
3830 r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT,
3831 ctx.gs_rotated_input[offset_reg], offset_chan,
3832 ctx.gs_rotated_input[0], 2,
3833 offset_reg, offset_chan,
3834 rotated_offset_reg, rotated_offset_chan);
3835 if (r)
3836 return r;
3837 }
3838 }
3839 }
3840
3841 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3842 r600_fetch_tess_io_info(&ctx);
3843
3844 if (shader->two_side && ctx.colors_used) {
3845 if ((r = process_twoside_color_inputs(&ctx)))
3846 return r;
3847 }
3848
3849 tgsi_parse_init(&ctx.parse, tokens);
3850 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3851 tgsi_parse_token(&ctx.parse);
3852 switch (ctx.parse.FullToken.Token.Type) {
3853 case TGSI_TOKEN_TYPE_INSTRUCTION:
3854 r = tgsi_is_supported(&ctx);
3855 if (r)
3856 goto out_err;
3857 ctx.max_driver_temp_used = 0;
3858 /* reserve first tmp for everyone */
3859 r600_get_temp(&ctx);
3860
3861 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3862 if ((r = tgsi_split_constant(&ctx)))
3863 goto out_err;
3864 if ((r = tgsi_split_literal_constant(&ctx)))
3865 goto out_err;
3866 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3867 if ((r = tgsi_split_gs_inputs(&ctx)))
3868 goto out_err;
3869 } else if (lds_inputs) {
3870 if ((r = tgsi_split_lds_inputs(&ctx)))
3871 goto out_err;
3872 }
3873 if (ctx.bc->chip_class == CAYMAN)
3874 ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3875 else if (ctx.bc->chip_class >= EVERGREEN)
3876 ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3877 else
3878 ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3879 r = ctx.inst_info->process(&ctx);
3880 if (r)
3881 goto out_err;
3882
3883 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3884 r = r600_store_tcs_output(&ctx);
3885 if (r)
3886 goto out_err;
3887 }
3888 break;
3889 default:
3890 break;
3891 }
3892 }
3893
3894 /* Reset the temporary register counter. */
3895 ctx.max_driver_temp_used = 0;
3896
3897 noutput = shader->noutput;
3898
3899 if (!ring_outputs && ctx.clip_vertex_write) {
3900 unsigned clipdist_temp[2];
3901
3902 clipdist_temp[0] = r600_get_temp(&ctx);
3903 clipdist_temp[1] = r600_get_temp(&ctx);
3904
3905 /* a clipvertex write needs to be converted into clipdistance writes,
3906 and the clip vertex must not be exported anymore */
3907
3908 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
3909 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3910 shader->output[noutput].gpr = clipdist_temp[0];
3911 noutput++;
3912 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3913 shader->output[noutput].gpr = clipdist_temp[1];
3914 noutput++;
3915
3916 /* reset spi_sid for clipvertex output to avoid confusing spi */
3917 shader->output[ctx.cv_output].spi_sid = 0;
3918
3919 shader->clip_dist_write = 0xFF;
3920 shader->cc_dist_mask = 0xFF;
3921
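/* Each clip distance is a dot product of the clip vertex with one of
 * eight vectors (presumably the user clip planes) read from constants
 * 512 + i of the R600_BUFFER_INFO constant buffer. */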
3922 for (i = 0; i < 8; i++) {
3923 int oreg = i >> 2;
3924 int ochan = i & 3;
3925
3926 for (j = 0; j < 4; j++) {
3927 struct r600_bytecode_alu alu;
3928 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3929 alu.op = ALU_OP2_DOT4;
3930 alu.src[0].sel = shader->output[ctx.cv_output].gpr;
3931 alu.src[0].chan = j;
3932
3933 alu.src[1].sel = 512 + i;
3934 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
3935 alu.src[1].chan = j;
3936
3937 alu.dst.sel = clipdist_temp[oreg];
3938 alu.dst.chan = j;
3939 alu.dst.write = (j == ochan);
3940 if (j == 3)
3941 alu.last = 1;
3942 r = r600_bytecode_add_alu(ctx.bc, &alu);
3943 if (r)
3944 return r;
3945 }
3946 }
3947 }
3948
3949 /* Add stream outputs. */
3950 if (so.num_outputs) {
3951 bool emit = false;
3952 if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
3953 emit = true;
3954 if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
3955 emit = true;
3956 if (emit)
3957 emit_streamout(&ctx, &so, -1, NULL);
3958 }
3959 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
3960 convert_edgeflag_to_int(&ctx);
3961
3962 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3963 r600_emit_tess_factor(&ctx);
3964
3965 if (lds_outputs) {
3966 if (ctx.type == PIPE_SHADER_VERTEX) {
3967 if (ctx.shader->noutput)
3968 emit_lds_vs_writes(&ctx);
3969 }
3970 } else if (ring_outputs) {
3971 if (shader->vs_as_es || shader->tes_as_es) {
3972 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
3973 ctx.gs_export_gpr_tregs[1] = -1;
3974 ctx.gs_export_gpr_tregs[2] = -1;
3975 ctx.gs_export_gpr_tregs[3] = -1;
3976
3977 emit_gs_ring_writes(&ctx, &so, -1, FALSE);
3978 }
3979 } else {
3980 /* Export output */
3981 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
3982
3983 for (i = 0, j = 0; i < noutput; i++, j++) {
3984 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3985 output[j].gpr = shader->output[i].gpr;
3986 output[j].elem_size = 3;
3987 output[j].swizzle_x = 0;
3988 output[j].swizzle_y = 1;
3989 output[j].swizzle_z = 2;
3990 output[j].swizzle_w = 3;
3991 output[j].burst_count = 1;
3992 output[j].type = 0xffffffff;
3993 output[j].op = CF_OP_EXPORT;
3994 switch (ctx.type) {
3995 case PIPE_SHADER_VERTEX:
3996 case PIPE_SHADER_TESS_EVAL:
3997 switch (shader->output[i].name) {
3998 case TGSI_SEMANTIC_POSITION:
3999 output[j].array_base = 60;
4000 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4001 pos_emitted = true;
4002 break;
4003
4004 case TGSI_SEMANTIC_PSIZE:
4005 output[j].array_base = 61;
4006 output[j].swizzle_y = 7;
4007 output[j].swizzle_z = 7;
4008 output[j].swizzle_w = 7;
4009 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4010 pos_emitted = true;
4011 break;
4012 case TGSI_SEMANTIC_EDGEFLAG:
4013 output[j].array_base = 61;
4014 output[j].swizzle_x = 7;
4015 output[j].swizzle_y = 0;
4016 output[j].swizzle_z = 7;
4017 output[j].swizzle_w = 7;
4018 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4019 pos_emitted = true;
4020 break;
4021 case TGSI_SEMANTIC_LAYER:
4022 /* spi_sid is 0 for outputs that are
4023 * not consumed by PS */
4024 if (shader->output[i].spi_sid) {
4025 output[j].array_base = next_param_base++;
4026 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4027 j++;
4028 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4029 }
4030 output[j].array_base = 61;
4031 output[j].swizzle_x = 7;
4032 output[j].swizzle_y = 7;
4033 output[j].swizzle_z = 0;
4034 output[j].swizzle_w = 7;
4035 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4036 pos_emitted = true;
4037 break;
4038 case TGSI_SEMANTIC_VIEWPORT_INDEX:
4039 /* spi_sid is 0 for outputs that are
4040 * not consumed by PS */
4041 if (shader->output[i].spi_sid) {
4042 output[j].array_base = next_param_base++;
4043 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4044 j++;
4045 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4046 }
4047 output[j].array_base = 61;
4048 output[j].swizzle_x = 7;
4049 output[j].swizzle_y = 7;
4050 output[j].swizzle_z = 7;
4051 output[j].swizzle_w = 0;
4052 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4053 pos_emitted = true;
4054 break;
4055 case TGSI_SEMANTIC_CLIPVERTEX:
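/* converted to clipdistance writes earlier - drop this export */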
4056 j--;
4057 break;
4058 case TGSI_SEMANTIC_CLIPDIST:
4059 output[j].array_base = next_clip_base++;
4060 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4061 pos_emitted = true;
4062 /* spi_sid is 0 for clipdistance outputs that were generated
4063 * for clipvertex - we don't need to pass them to PS */
4064 if (shader->output[i].spi_sid) {
4065 j++;
4066 /* duplicate it as PARAM to pass to the pixel shader */
4067 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
4068 output[j].array_base = next_param_base++;
4069 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4070 }
4071 break;
4072 case TGSI_SEMANTIC_FOG:
4073 output[j].swizzle_y = 4; /* 0 */
4074 output[j].swizzle_z = 4; /* 0 */
4075 output[j].swizzle_w = 5; /* 1 */
4076 break;
4077 case TGSI_SEMANTIC_PRIMID:
4078 output[j].swizzle_x = 2;
4079 output[j].swizzle_y = 4; /* 0 */
4080 output[j].swizzle_z = 4; /* 0 */
4081 output[j].swizzle_w = 4; /* 0 */
4082 break;
4083 }
4084
4085 break;
4086 case PIPE_SHADER_FRAGMENT:
4087 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
4088 /* never export more colors than the number of CBs */
4089 if (shader->output[i].sid >= max_color_exports) {
4090 /* skip export */
4091 j--;
4092 continue;
4093 }
4094 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4095 output[j].array_base = shader->output[i].sid;
4096 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4097 shader->nr_ps_color_exports++;
4098 shader->ps_color_export_mask |= (0xf << (shader->output[i].sid * 4));
4099
4100 /* If the i-th target format is set, all previous target formats must
4101 * be non-zero to avoid hangs. Noted in radeonsi; it seems to apply
4102 * to Evergreen as well. */
4103 if (shader->output[i].sid > 0)
4104 for (unsigned x = 0; x < shader->output[i].sid; x++)
4105 shader->ps_color_export_mask |= (1 << (x*4));
4106
4107 if (shader->output[i].sid > shader->ps_export_highest)
4108 shader->ps_export_highest = shader->output[i].sid;
4109 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
4110 for (k = 1; k < max_color_exports; k++) {
4111 j++;
4112 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4113 output[j].gpr = shader->output[i].gpr;
4114 output[j].elem_size = 3;
4115 output[j].swizzle_x = 0;
4116 output[j].swizzle_y = 1;
4117 output[j].swizzle_z = 2;
4118 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
4119 output[j].burst_count = 1;
4120 output[j].array_base = k;
4121 output[j].op = CF_OP_EXPORT;
4122 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4123 shader->nr_ps_color_exports++;
4124 shader->ps_color_export_mask |= (0xf << (j * 4));
4125 }
4126 }
4127 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
4128 output[j].array_base = 61;
4129 output[j].swizzle_x = 2;
4130 output[j].swizzle_y = 7;
4131 output[j].swizzle_z = output[j].swizzle_w = 7;
4132 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4133 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
4134 output[j].array_base = 61;
4135 output[j].swizzle_x = 7;
4136 output[j].swizzle_y = 1;
4137 output[j].swizzle_z = output[j].swizzle_w = 7;
4138 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4139 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
4140 output[j].array_base = 61;
4141 output[j].swizzle_x = 7;
4142 output[j].swizzle_y = 7;
4143 output[j].swizzle_z = 0;
4144 output[j].swizzle_w = 7;
4145 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4146 } else {
4147 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
4148 r = -EINVAL;
4149 goto out_err;
4150 }
4151 break;
4152 case PIPE_SHADER_TESS_CTRL:
4153 break;
4154 default:
4155 R600_ERR("unsupported processor type %d\n", ctx.type);
4156 r = -EINVAL;
4157 goto out_err;
4158 }
4159
4160 if (output[j].type == 0xffffffff) {
4161 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4162 output[j].array_base = next_param_base++;
4163 }
4164 }
4165
4166 /* add fake position export */
4167 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
4168 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4169 output[j].gpr = 0;
4170 output[j].elem_size = 3;
4171 output[j].swizzle_x = 7;
4172 output[j].swizzle_y = 7;
4173 output[j].swizzle_z = 7;
4174 output[j].swizzle_w = 7;
4175 output[j].burst_count = 1;
4176 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
4177 output[j].array_base = 60;
4178 output[j].op = CF_OP_EXPORT;
4179 j++;
4180 }
4181
4182 /* add a fake param output if the VS/TES exports no param */
4183 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
4184 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4185 output[j].gpr = 0;
4186 output[j].elem_size = 3;
4187 output[j].swizzle_x = 7;
4188 output[j].swizzle_y = 7;
4189 output[j].swizzle_z = 7;
4190 output[j].swizzle_w = 7;
4191 output[j].burst_count = 1;
4192 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
4193 output[j].array_base = 0;
4194 output[j].op = CF_OP_EXPORT;
4195 j++;
4196 }
4197
4198 /* add fake pixel export */
4199 if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
4200 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
4201 output[j].gpr = 0;
4202 output[j].elem_size = 3;
4203 output[j].swizzle_x = 7;
4204 output[j].swizzle_y = 7;
4205 output[j].swizzle_z = 7;
4206 output[j].swizzle_w = 7;
4207 output[j].burst_count = 1;
4208 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
4209 output[j].array_base = 0;
4210 output[j].op = CF_OP_EXPORT;
4211 j++;
4212 shader->nr_ps_color_exports++;
4213 shader->ps_color_export_mask = 0xf;
4214 }
4215
4216 noutput = j;
4217
4218 /* set export done on last export of each type */
4219 for (k = noutput - 1, output_done = 0; k >= 0; k--) {
4220 if (!(output_done & (1 << output[k].type))) {
4221 output_done |= (1 << output[k].type);
4222 output[k].op = CF_OP_EXPORT_DONE;
4223 }
4224 }
4225 /* add output to bytecode */
4226 for (i = 0; i < noutput; i++) {
4227 r = r600_bytecode_add_output(ctx.bc, &output[i]);
4228 if (r)
4229 goto out_err;
4230 }
4231 }
4232
4233 /* add program end */
4234 if (ctx.bc->chip_class == CAYMAN)
4235 cm_bytecode_add_cf_end(ctx.bc);
4236 else {
4237 const struct cf_op_info *last = NULL;
4238
4239 if (ctx.bc->cf_last)
4240 last = r600_isa_cf(ctx.bc->cf_last->op);
4241
4242 /* ALU clause instructions don't have an EOP bit, so add a NOP */
4243 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP)
4244 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
4245
4246 ctx.bc->cf_last->end_of_program = 1;
4247 }
4248
4249 /* check GPR limit - we have 124 = 128 - 4
4250 * (4 are reserved as alu clause temporary registers) */
4251 if (ctx.bc->ngpr > 124) {
4252 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
4253 r = -ENOMEM;
4254 goto out_err;
4255 }
4256
4257 if (ctx.type == PIPE_SHADER_GEOMETRY) {
4258 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
4259 return r;
4260 }
4261
4262 free(ctx.spilled_arrays);
4263 free(ctx.array_infos);
4264 free(ctx.literals);
4265 tgsi_parse_free(&ctx.parse);
4266 return 0;
4267 out_err:
4268 free(ctx.spilled_arrays);
4269 free(ctx.array_infos);
4270 free(ctx.literals);
4271 tgsi_parse_free(&ctx.parse);
4272 return r;
4273 }
4274
4275 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
4276 {
4277 const unsigned tgsi_opcode =
4278 ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
4279 R600_ERR("%s tgsi opcode unsupported\n",
4280 tgsi_get_opcode_name(tgsi_opcode));
4281 return -EINVAL;
4282 }
4283
4284 static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)
4285 {
4286 return 0;
4287 }
4288
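/* Translate a TGSI-level source operand into ALU source operand fields,
 * applying the per-channel swizzle. */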
4289 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
4290 const struct r600_shader_src *shader_src,
4291 unsigned chan)
4292 {
4293 bc_src->sel = shader_src->sel;
4294 bc_src->chan = shader_src->swizzle[chan];
4295 bc_src->neg = shader_src->neg;
4296 bc_src->abs = shader_src->abs;
4297 bc_src->rel = shader_src->rel;
4298 bc_src->value = shader_src->value[bc_src->chan];
4299 bc_src->kc_bank = shader_src->kc_bank;
4300 bc_src->kc_rel = shader_src->kc_rel;
4301 }
4302
4303 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
4304 {
4305 bc_src->abs = 1;
4306 bc_src->neg = 0;
4307 }
4308
4309 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
4310 {
4311 bc_src->neg = !bc_src->neg;
4312 }
4313
4314 static void tgsi_dst(struct r600_shader_ctx *ctx,
4315 const struct tgsi_full_dst_register *tgsi_dst,
4316 unsigned swizzle,
4317 struct r600_bytecode_alu_dst *r600_dst)
4318 {
4319 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4320
4321 if (tgsi_dst->Register.File == TGSI_FILE_TEMPORARY) {
4322 bool spilled;
4323 unsigned idx;
4324
4325 idx = map_tgsi_reg_index_to_r600_gpr(ctx, tgsi_dst->Register.Index, &spilled);
4326
4327 if (spilled) {
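/* The temp array backing this register was spilled: route the result
 * through a fresh GPR and queue a MEM_SCRATCH write that stores it to
 * the array's location in scratch memory. */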
4328 struct r600_bytecode_output cf;
4329 int reg = r600_get_temp(ctx);
4330 int r;
4331
4332 r600_dst->sel = reg;
4333 r600_dst->chan = swizzle;
4334 r600_dst->write = 1;
4335 if (inst->Instruction.Saturate) {
4336 r600_dst->clamp = 1;
4337 }
4338
4339 // this scratch write must be emitted after the op that uses tgsi_dst
4340 memset(&cf, 0, sizeof(struct r600_bytecode_output));
4341 cf.op = CF_OP_MEM_SCRATCH;
4342 cf.elem_size = 3;
4343 cf.gpr = reg;
4344 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
4345 cf.mark = 1;
4346 cf.comp_mask = inst->Dst[0].Register.WriteMask;
4347 cf.swizzle_x = 0;
4348 cf.swizzle_y = 1;
4349 cf.swizzle_z = 2;
4350 cf.swizzle_w = 3;
4351 cf.burst_count = 1;
4352
4353 get_spilled_array_base_and_size(ctx, tgsi_dst->Register.Index,
4354 &cf.array_base, &cf.array_size);
4355
4356 if (tgsi_dst->Register.Indirect) {
4357 if (ctx->bc->chip_class < R700)
4358 cf.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
4359 else
4360 cf.type = 3; // V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND_ACK;
4361 cf.index_gpr = ctx->bc->ar_reg;
4362 }
4363 else {
4364 cf.array_base += idx;
4365 cf.array_size = 0;
4366 }
4367
4368 r = r600_bytecode_add_pending_output(ctx->bc, &cf);
4369 if (r)
4370 return;
4371
4372 if (ctx->bc->chip_class >= R700)
4373 r600_bytecode_need_wait_ack(ctx->bc, true);
4374
4375 return;
4376 }
4377 else {
4378 r600_dst->sel = idx;
4379 }
4380 }
4381 else {
4382 r600_dst->sel = tgsi_dst->Register.Index;
4383 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
4384 }
4385 r600_dst->chan = swizzle;
4386 r600_dst->write = 1;
4387 if (inst->Instruction.Saturate) {
4388 r600_dst->clamp = 1;
4389 }
4390 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
4391 if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
4392 return;
4393 }
4394 }
4395 if (tgsi_dst->Register.Indirect)
4396 r600_dst->rel = V_SQ_REL_RELATIVE;
4397
4398 }
4399
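/* Doubles occupy a channel pair (xy or zw). For single-dest ops the
 * one-channel write mask is widened to the matching pair below, and
 * fp64_switch() swaps the two 32-bit halves of each source pair, which
 * the hw appears to expect in the opposite order. */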
4400 static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override)
4401 {
4402 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4403 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4404 struct r600_bytecode_alu alu;
4405 int i, j, r, lasti = tgsi_last_instruction(write_mask);
4406 int use_tmp = 0;
4407 int swizzle_x = inst->Src[0].Register.SwizzleX;
4408
4409 if (singledest) {
4410 switch (write_mask) {
4411 case 0x1:
4412 if (swizzle_x == 2) {
4413 write_mask = 0xc;
4414 use_tmp = 3;
4415 } else
4416 write_mask = 0x3;
4417 break;
4418 case 0x2:
4419 if (swizzle_x == 2) {
4420 write_mask = 0xc;
4421 use_tmp = 3;
4422 } else {
4423 write_mask = 0x3;
4424 use_tmp = 1;
4425 }
4426 break;
4427 case 0x4:
4428 if (swizzle_x == 0) {
4429 write_mask = 0x3;
4430 use_tmp = 1;
4431 } else
4432 write_mask = 0xc;
4433 break;
4434 case 0x8:
4435 if (swizzle_x == 0) {
4436 write_mask = 0x3;
4437 use_tmp = 1;
4438 } else {
4439 write_mask = 0xc;
4440 use_tmp = 3;
4441 }
4442 break;
4443 }
4444 }
4445
4446 lasti = tgsi_last_instruction(write_mask);
4447 for (i = 0; i <= lasti; i++) {
4448
4449 if (!(write_mask & (1 << i)))
4450 continue;
4451
4452 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4453
4454 if (singledest) {
4455 if (use_tmp || dest_temp) {
4456 alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp;
4457 alu.dst.chan = i;
4458 alu.dst.write = 1;
4459 } else {
4460 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4461 }
4462 if (i == 1 || i == 3)
4463 alu.dst.write = 0;
4464 } else
4465 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4466
4467 alu.op = op_override ? op_override : ctx->inst_info->op;
4468 if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
4469 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4470 } else if (!swap) {
4471 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4472 r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
4473 }
4474 } else {
4475 r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
4476 r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
4477 }
4478
4479 /* handle some special cases */
4480 if (i == 1 || i == 3) {
4481 switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
4482 case TGSI_OPCODE_DABS:
4483 r600_bytecode_src_set_abs(&alu.src[0]);
4484 break;
4485 default:
4486 break;
4487 }
4488 }
4489 if (i == lasti) {
4490 alu.last = 1;
4491 }
4492 r = r600_bytecode_add_alu(ctx->bc, &alu);
4493 if (r)
4494 return r;
4495 }
4496
4497 if (use_tmp) {
4498 write_mask = inst->Dst[0].Register.WriteMask;
4499
4500 lasti = tgsi_last_instruction(write_mask);
4501 /* move result from temp to dst */
4502 for (i = 0; i <= lasti; i++) {
4503 if (!(write_mask & (1 << i)))
4504 continue;
4505
4506 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4507 alu.op = ALU_OP1_MOV;
4508
4509 if (dest_temp) {
4510 alu.dst.sel = dest_temp;
4511 alu.dst.chan = i;
4512 alu.dst.write = 1;
4513 } else
4514 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4515 alu.src[0].sel = ctx->temp_reg;
4516 alu.src[0].chan = use_tmp - 1;
4517 alu.last = (i == lasti);
4518
4519 r = r600_bytecode_add_alu(ctx->bc, &alu);
4520 if (r)
4521 return r;
4522 }
4523 }
4524 return 0;
4525 }
4526
4527 static int tgsi_op2_64(struct r600_shader_ctx *ctx)
4528 {
4529 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4530 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4531 /* validate the write mask: 64-bit values always occupy a full channel pair */
4532 if ((write_mask & 0x3) != 0x3 &&
4533 (write_mask & 0xc) != 0xc) {
4534 fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
4535 return -1;
4536 }
4537 return tgsi_op2_64_params(ctx, false, false, 0, 0);
4538 }
4539
4540 static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
4541 {
4542 return tgsi_op2_64_params(ctx, true, false, 0, 0);
4543 }
4544
4545 static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
4546 {
4547 return tgsi_op2_64_params(ctx, true, true, 0, 0);
4548 }
4549
4550 static int tgsi_op3_64(struct r600_shader_ctx *ctx)
4551 {
4552 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4553 struct r600_bytecode_alu alu;
4554 int i, j, r;
4555 int lasti = 3;
4556 int tmp = r600_get_temp(ctx);
4557
4558 for (i = 0; i < lasti + 1; i++) {
4559
4560 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4561 alu.op = ctx->inst_info->op;
4562 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4563 r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
4564 }
4565
4566 if (inst->Dst[0].Register.WriteMask & (1 << i))
4567 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4568 else
4569 alu.dst.sel = tmp;
4570
4571 alu.dst.chan = i;
4572 alu.is_op3 = 1;
4573 if (i == lasti) {
4574 alu.last = 1;
4575 }
4576 r = r600_bytecode_add_alu(ctx->bc, &alu);
4577 if (r)
4578 return r;
4579 }
4580 return 0;
4581 }
4582
4583 static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
4584 {
4585 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4586 struct r600_bytecode_alu alu;
4587 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4588 int i, j, r, lasti = tgsi_last_instruction(write_mask);
4589 /* use temp register if trans_only and more than one dst component */
4590 int use_tmp = trans_only && (write_mask ^ (1 << lasti));
4591 unsigned op = ctx->inst_info->op;
4592
4593 if (op == ALU_OP2_MUL_IEEE &&
4594 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
4595 op = ALU_OP2_MUL;
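/* legacy MUL returns 0 for 0 * anything (including Inf/NaN), which is
 * what MUL_ZERO_WINS asks for; MUL_IEEE would yield NaN */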
4596
4597 for (i = 0; i <= lasti; i++) {
4598 if (!(write_mask & (1 << i)))
4599 continue;
4600
4601 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4602 if (use_tmp) {
4603 alu.dst.sel = ctx->temp_reg;
4604 alu.dst.chan = i;
4605 alu.dst.write = 1;
4606 } else
4607 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4608
4609 alu.op = op;
4610 if (!swap) {
4611 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4612 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
4613 }
4614 } else {
4615 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4616 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4617 }
4618 if (i == lasti || trans_only) {
4619 alu.last = 1;
4620 }
4621 r = r600_bytecode_add_alu(ctx->bc, &alu);
4622 if (r)
4623 return r;
4624 }
4625
4626 if (use_tmp) {
4627 /* move result from temp to dst */
4628 for (i = 0; i <= lasti; i++) {
4629 if (!(write_mask & (1 << i)))
4630 continue;
4631
4632 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4633 alu.op = ALU_OP1_MOV;
4634 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4635 alu.src[0].sel = ctx->temp_reg;
4636 alu.src[0].chan = i;
4637 alu.last = (i == lasti);
4638
4639 r = r600_bytecode_add_alu(ctx->bc, &alu);
4640 if (r)
4641 return r;
4642 }
4643 }
4644 return 0;
4645 }
4646
4647 static int tgsi_op2(struct r600_shader_ctx *ctx)
4648 {
4649 return tgsi_op2_s(ctx, 0, 0);
4650 }
4651
4652 static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
4653 {
4654 return tgsi_op2_s(ctx, 1, 0);
4655 }
4656
4657 static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
4658 {
4659 return tgsi_op2_s(ctx, 0, 1);
4660 }
4661
4662 static int tgsi_ineg(struct r600_shader_ctx *ctx)
4663 {
4664 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4665 struct r600_bytecode_alu alu;
4666 int i, r;
4667 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4668
4669 for (i = 0; i < lasti + 1; i++) {
4670
4671 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4672 continue;
4673 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4674 alu.op = ctx->inst_info->op;
4675
4676 alu.src[0].sel = V_SQ_ALU_SRC_0;
4677
4678 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4679
4680 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4681
4682 if (i == lasti) {
4683 alu.last = 1;
4684 }
4685 r = r600_bytecode_add_alu(ctx->bc, &alu);
4686 if (r)
4687 return r;
4688 }
4689 return 0;
4690
4691 }
4692
4693 static int tgsi_dneg(struct r600_shader_ctx *ctx)
4694 {
4695 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4696 struct r600_bytecode_alu alu;
4697 int i, r;
4698 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4699
4700 for (i = 0; i < lasti + 1; i++) {
4701
4702 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4703 continue;
4704 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4705 alu.op = ALU_OP1_MOV;
4706
4707 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4708
4709 if (i == 1 || i == 3)
4710 r600_bytecode_src_toggle_neg(&alu.src[0]);
4711 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4712
4713 if (i == lasti) {
4714 alu.last = 1;
4715 }
4716 r = r600_bytecode_add_alu(ctx->bc, &alu);
4717 if (r)
4718 return r;
4719 }
4720 return 0;
4721
4722 }
4723
4724 static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
4725 {
4726 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4727 struct r600_bytecode_alu alu;
4728 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4729 int i, j, r;
4730
4731 for (i = 0; i <= 3; i++) {
4732 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4733 alu.op = ctx->inst_info->op;
4734
4735 alu.dst.sel = ctx->temp_reg;
4736 alu.dst.chan = i;
4737 alu.dst.write = 1;
4738 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4739 r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
4740 }
4741
4742 if (i == 3)
4743 alu.last = 1;
4744
4745 r = r600_bytecode_add_alu(ctx->bc, &alu);
4746 if (r)
4747 return r;
4748 }
4749
4750 /* Replicate significand result across channels. */
4751 for (i = 0; i <= 3; i++) {
4752 if (!(write_mask & (1 << i)))
4753 continue;
4754
4755 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4756 alu.op = ALU_OP1_MOV;
4757 alu.src[0].chan = (i & 1) + 2;
4758 alu.src[0].sel = ctx->temp_reg;
4759
4760 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4761 alu.dst.write = 1;
4762 alu.last = 1;
4763 r = r600_bytecode_add_alu(ctx->bc, &alu);
4764 if (r)
4765 return r;
4766 }
4767
4768 for (i = 0; i <= 3; i++) {
4769 if (inst->Dst[1].Register.WriteMask & (1 << i)) {
4770 /* MOV temp channel 1 (the exponent result) to the first channel set in dst1's writemask */
4771 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4772 alu.op = ALU_OP1_MOV;
4773 alu.src[0].chan = 1;
4774 alu.src[0].sel = ctx->temp_reg;
4775
4776 tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
4777 alu.last = 1;
4778 r = r600_bytecode_add_alu(ctx->bc, &alu);
4779 if (r)
4780 return r;
4781 break;
4782 }
4783 }
4784 return 0;
4785 }
4786
4787
4788 static int egcm_int_to_double(struct r600_shader_ctx *ctx)
4789 {
4790 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4791 struct r600_bytecode_alu alu;
4792 int i, c, r;
4793 int write_mask = inst->Dst[0].Register.WriteMask;
4794 int temp_reg = r600_get_temp(ctx);
4795
4796 assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
4797 inst->Instruction.Opcode == TGSI_OPCODE_U2D);
4798
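/* Split the 32-bit integer into a low 8-bit and a high 24-bit part:
 * each part converts to float exactly (floats have a 24-bit mantissa),
 * and the two halves are later widened to double and summed to give an
 * exact result. */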
4799 for (c = 0; c < 2; c++) {
4800 int dchan = c * 2;
4801 if (write_mask & (0x3 << dchan)) {
4802 /* split into a high 24-bit part and a low 8-bit part */
4803 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4804 alu.op = ALU_OP2_AND_INT;
4805 alu.dst.sel = temp_reg;
4806 alu.dst.chan = dchan;
4807 r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
4808 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4809 alu.src[1].value = 0xffffff00;
4810 alu.dst.write = 1;
4811 r = r600_bytecode_add_alu(ctx->bc, &alu);
4812 if (r)
4813 return r;
4814
4815 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4816 alu.op = ALU_OP2_AND_INT;
4817 alu.dst.sel = temp_reg;
4818 alu.dst.chan = dchan + 1;
4819 r600_bytecode_src(&alu.src[0], &ctx->src[0], c);
4820 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4821 alu.src[1].value = 0xff;
4822 alu.dst.write = 1;
4823 alu.last = 1;
4824 r = r600_bytecode_add_alu(ctx->bc, &alu);
4825 if (r)
4826 return r;
4827 }
4828 }
4829
4830 for (c = 0; c < 2; c++) {
4831 int dchan = c * 2;
4832 if (write_mask & (0x3 << dchan)) {
4833 for (i = dchan; i <= dchan + 1; i++) {
4834 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4835 alu.op = i == dchan ? ctx->inst_info->op : ALU_OP1_UINT_TO_FLT;
4836
4837 alu.src[0].sel = temp_reg;
4838 alu.src[0].chan = i;
4839 alu.dst.sel = temp_reg;
4840 alu.dst.chan = i;
4841 alu.dst.write = 1;
4842 if (ctx->bc->chip_class == CAYMAN)
4843 alu.last = i == dchan + 1;
4844 else
4845 alu.last = 1; /* trans only ops on evergreen */
4846
4847 r = r600_bytecode_add_alu(ctx->bc, &alu);
4848 if (r)
4849 return r;
4850 }
4851 }
4852 }
4853
4854 for (c = 0; c < 2; c++) {
4855 int dchan = c * 2;
4856 if (write_mask & (0x3 << dchan)) {
4857 for (i = 0; i < 4; i++) {
4858 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4859 alu.op = ALU_OP1_FLT32_TO_FLT64;
4860
4861 alu.src[0].chan = dchan + (i / 2);
4862 if (i == 0 || i == 2)
4863 alu.src[0].sel = temp_reg;
4864 else {
4865 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
4866 alu.src[0].value = 0x0;
4867 }
4868 alu.dst.sel = ctx->temp_reg;
4869 alu.dst.chan = i;
4870 alu.last = i == 3;
4871 alu.dst.write = 1;
4872
4873 r = r600_bytecode_add_alu(ctx->bc, &alu);
4874 if (r)
4875 return r;
4876 }
4877
4878 for (i = 0; i <= 1; i++) {
4879 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4880 alu.op = ALU_OP2_ADD_64;
4881
4882 alu.src[0].chan = fp64_switch(i);
4883 alu.src[0].sel = ctx->temp_reg;
4884
4885 alu.src[1].chan = fp64_switch(i + 2);
4886 alu.src[1].sel = ctx->temp_reg;
4887 tgsi_dst(ctx, &inst->Dst[0], dchan + i, &alu.dst);
4888 alu.last = i == 1;
4889
4890 r = r600_bytecode_add_alu(ctx->bc, &alu);
4891 if (r)
4892 return r;
4893 }
4894 }
4895 }
4896
4897 return 0;
4898 }
4899
4900 static int egcm_double_to_int(struct r600_shader_ctx *ctx)
4901 {
4902 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4903 struct r600_bytecode_alu alu;
4904 int i, r;
4905 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4906 int treg = r600_get_temp(ctx);
4907 assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
4908 inst->Instruction.Opcode == TGSI_OPCODE_D2U);
4909
4910 /* do a 64->32 into a temp register */
4911 r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);
4912 if (r)
4913 return r;
4914
4915 for (i = 0; i <= lasti; i++) {
4916 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4917 continue;
4918 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4919 alu.op = ctx->inst_info->op;
4920
4921 alu.src[0].chan = i;
4922 alu.src[0].sel = treg;
4923 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4924 alu.last = (i == lasti);
4925
4926 r = r600_bytecode_add_alu(ctx->bc, &alu);
4927 if (r)
4928 return r;
4929 }
4930
4931 return 0;
4932 }
4933
4934 static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
4935 unsigned op,
4936 int dst_reg,
4937 struct r600_shader_src *src,
4938 bool abs)
4939 {
4940 struct r600_bytecode_alu alu;
4941 const int last_slot = 3;
4942 int r;
4943
4944 /* these have to write the result to X/Y by the looks of it */
4945 for (int i = 0 ; i < last_slot; i++) {
4946 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4947 alu.op = op;
4948
4949 r600_bytecode_src(&alu.src[0], src, 1);
4950 r600_bytecode_src(&alu.src[1], src, 0);
4951
4952 if (abs)
4953 r600_bytecode_src_set_abs(&alu.src[1]);
4954
4955 alu.dst.sel = dst_reg;
4956 alu.dst.chan = i;
4957 alu.dst.write = (i == 0 || i == 1);
4958
4959 if (bc->chip_class != CAYMAN || i == last_slot - 1)
4960 alu.last = 1;
4961 r = r600_bytecode_add_alu(bc, &alu);
4962 if (r)
4963 return r;
4964 }
4965
4966 return 0;
4967 }
4968
4969 static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
4970 {
4971 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4972 int i, r;
4973 struct r600_bytecode_alu alu;
4974 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4975 int t1 = ctx->temp_reg;
4976
4977 /* there should be only one source register */
4978 assert(inst->Instruction.NumSrcRegs == 1);
4979
4980 /* only support one double at a time */
4981 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
4982 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
4983
4984 r = cayman_emit_unary_double_raw(
4985 ctx->bc, ctx->inst_info->op, t1,
4986 &ctx->src[0],
4987 ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
4988 ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
4989 if (r)
4990 return r;
4991
4992 for (i = 0 ; i <= lasti; i++) {
4993 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4994 continue;
4995 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4996 alu.op = ALU_OP1_MOV;
4997 alu.src[0].sel = t1;
4998 alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
4999 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5000 alu.dst.write = 1;
5001 if (i == lasti)
5002 alu.last = 1;
5003 r = r600_bytecode_add_alu(ctx->bc, &alu);
5004 if (r)
5005 return r;
5006 }
5007 return 0;
5008 }
5009
5010 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
5011 {
5012 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5013 int i, j, r;
5014 struct r600_bytecode_alu alu;
5015 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5016
5017 for (i = 0 ; i < last_slot; i++) {
5018 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5019 alu.op = ctx->inst_info->op;
5020 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5021 r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
5022
5023 /* RSQ should take the absolute value of src */
5024 if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
5025 r600_bytecode_src_set_abs(&alu.src[j]);
5026 }
5027 }
5028 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5029 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5030
5031 if (i == last_slot - 1)
5032 alu.last = 1;
5033 r = r600_bytecode_add_alu(ctx->bc, &alu);
5034 if (r)
5035 return r;
5036 }
5037 return 0;
5038 }
5039
5040 static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
5041 {
5042 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5043 int i, j, k, r;
5044 struct r600_bytecode_alu alu;
5045 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5046 int t1 = ctx->temp_reg;
5047
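/* On Cayman this multiply executes as a vector op: issue it in all four
 * slots and keep only the result computed in slot k (dst.write below). */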
5048 for (k = 0; k <= lasti; k++) {
5049 if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
5050 continue;
5051
5052 for (i = 0 ; i < 4; i++) {
5053 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5054 alu.op = ctx->inst_info->op;
5055 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5056 r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
5057 }
5058 alu.dst.sel = t1;
5059 alu.dst.chan = i;
5060 alu.dst.write = (i == k);
5061 if (i == 3)
5062 alu.last = 1;
5063 r = r600_bytecode_add_alu(ctx->bc, &alu);
5064 if (r)
5065 return r;
5066 }
5067 }
5068
5069 for (i = 0 ; i <= lasti; i++) {
5070 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5071 continue;
5072 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5073 alu.op = ALU_OP1_MOV;
5074 alu.src[0].sel = t1;
5075 alu.src[0].chan = i;
5076 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5077 alu.dst.write = 1;
5078 if (i == lasti)
5079 alu.last = 1;
5080 r = r600_bytecode_add_alu(ctx->bc, &alu);
5081 if (r)
5082 return r;
5083 }
5084
5085 return 0;
5086 }
5087
5088
5089 static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
5090 {
5091 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5092 int i, j, k, r;
5093 struct r600_bytecode_alu alu;
5094 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5095 int t1 = ctx->temp_reg;
5096
5097 /* t1 would get overwritten below if we actually tried to
5098 * multiply two pairs of doubles at a time. */
5099 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
5100 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
5101
5102 k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
5103
5104 for (i = 0; i < 4; i++) {
5105 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5106 alu.op = ctx->inst_info->op;
5107 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5108 r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
5109 }
5110 alu.dst.sel = t1;
5111 alu.dst.chan = i;
5112 alu.dst.write = 1;
5113 if (i == 3)
5114 alu.last = 1;
5115 r = r600_bytecode_add_alu(ctx->bc, &alu);
5116 if (r)
5117 return r;
5118 }
5119
5120 for (i = 0; i <= lasti; i++) {
5121 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5122 continue;
5123 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5124 alu.op = ALU_OP1_MOV;
5125 alu.src[0].sel = t1;
5126 alu.src[0].chan = i;
5127 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5128 alu.dst.write = 1;
5129 if (i == lasti)
5130 alu.last = 1;
5131 r = r600_bytecode_add_alu(ctx->bc, &alu);
5132 if (r)
5133 return r;
5134 }
5135
5136 return 0;
5137 }
5138
5139 /*
5140 * Emit RECIP_64 + MUL_64 to implement division.
5141 */
5142 static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
5143 {
5144 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5145 int r;
5146 struct r600_bytecode_alu alu;
5147 int t1 = ctx->temp_reg;
5148 int k;
5149
5150 /* Only support one double at a time. This is the same constraint as
5151 * in DMUL lowering. */
5152 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
5153 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
5154
5155 k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
5156
5157 r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
5158 if (r)
5159 return r;
5160
5161 for (int i = 0; i < 4; i++) {
5162 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5163 alu.op = ALU_OP2_MUL_64;
5164
5165 r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));
5166
5167 alu.src[1].sel = t1;
5168 alu.src[1].chan = (i == 3) ? 0 : 1;
5169
5170 alu.dst.sel = t1;
5171 alu.dst.chan = i;
5172 alu.dst.write = 1;
5173 if (i == 3)
5174 alu.last = 1;
5175 r = r600_bytecode_add_alu(ctx->bc, &alu);
5176 if (r)
5177 return r;
5178 }
5179
5180 for (int i = 0; i < 2; i++) {
5181 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5182 alu.op = ALU_OP1_MOV;
5183 alu.src[0].sel = t1;
5184 alu.src[0].chan = i;
5185 tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
5186 alu.dst.write = 1;
5187 if (i == 1)
5188 alu.last = 1;
5189 r = r600_bytecode_add_alu(ctx->bc, &alu);
5190 if (r)
5191 return r;
5192 }
5193 return 0;
5194 }
5195
5196 /*
5197 * r600 - trunc to -PI..PI range
5198 * r700 - normalize by dividing by 2PI
5199 * see fdo bug 27901
5200 */
5201 static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
5202 {
5203 int r;
5204 struct r600_bytecode_alu alu;
5205
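/* Range reduction: tmp.x = fract(src * 1/(2*PI) + 0.5). The final
 * MULADD below maps this back to [-PI, PI) on r600, or leaves the
 * normalized [-0.5, 0.5) value that the trig ops take on r700+. */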
5206 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5207 alu.op = ALU_OP3_MULADD;
5208 alu.is_op3 = 1;
5209
5210 alu.dst.chan = 0;
5211 alu.dst.sel = ctx->temp_reg;
5212 alu.dst.write = 1;
5213
5214 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5215
5216 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5217 alu.src[1].chan = 0;
5218 alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
5219 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
5220 alu.src[2].chan = 0;
5221 alu.last = 1;
5222 r = r600_bytecode_add_alu(ctx->bc, &alu);
5223 if (r)
5224 return r;
5225
5226 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5227 alu.op = ALU_OP1_FRACT;
5228
5229 alu.dst.chan = 0;
5230 alu.dst.sel = ctx->temp_reg;
5231 alu.dst.write = 1;
5232
5233 alu.src[0].sel = ctx->temp_reg;
5234 alu.src[0].chan = 0;
5235 alu.last = 1;
5236 r = r600_bytecode_add_alu(ctx->bc, &alu);
5237 if (r)
5238 return r;
5239
5240 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5241 alu.op = ALU_OP3_MULADD;
5242 alu.is_op3 = 1;
5243
5244 alu.dst.chan = 0;
5245 alu.dst.sel = ctx->temp_reg;
5246 alu.dst.write = 1;
5247
5248 alu.src[0].sel = ctx->temp_reg;
5249 alu.src[0].chan = 0;
5250
5251 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5252 alu.src[1].chan = 0;
5253 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5254 alu.src[2].chan = 0;
5255
5256 if (ctx->bc->chip_class == R600) {
5257 alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
5258 alu.src[2].value = u_bitcast_f2u(-M_PI);
5259 } else {
5260 alu.src[1].sel = V_SQ_ALU_SRC_1;
5261 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
5262 alu.src[2].neg = 1;
5263 }
5264
5265 alu.last = 1;
5266 r = r600_bytecode_add_alu(ctx->bc, &alu);
5267 if (r)
5268 return r;
5269 return 0;
5270 }
5271
5272 static int cayman_trig(struct r600_shader_ctx *ctx)
5273 {
5274 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5275 struct r600_bytecode_alu alu;
5276 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5277 int i, r;
5278
5279 r = tgsi_setup_trig(ctx);
5280 if (r)
5281 return r;
5282
5283
5284 for (i = 0; i < last_slot; i++) {
5285 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5286 alu.op = ctx->inst_info->op;
5287 alu.dst.chan = i;
5288
5289 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5290 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5291
5292 alu.src[0].sel = ctx->temp_reg;
5293 alu.src[0].chan = 0;
5294 if (i == last_slot - 1)
5295 alu.last = 1;
5296 r = r600_bytecode_add_alu(ctx->bc, &alu);
5297 if (r)
5298 return r;
5299 }
5300 return 0;
5301 }
5302
5303 static int tgsi_trig(struct r600_shader_ctx *ctx)
5304 {
5305 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5306 struct r600_bytecode_alu alu;
5307 int i, r;
5308 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5309
5310 r = tgsi_setup_trig(ctx);
5311 if (r)
5312 return r;
5313
5314 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5315 alu.op = ctx->inst_info->op;
5316 alu.dst.chan = 0;
5317 alu.dst.sel = ctx->temp_reg;
5318 alu.dst.write = 1;
5319
5320 alu.src[0].sel = ctx->temp_reg;
5321 alu.src[0].chan = 0;
5322 alu.last = 1;
5323 r = r600_bytecode_add_alu(ctx->bc, &alu);
5324 if (r)
5325 return r;
5326
5327 /* replicate result */
5328 for (i = 0; i < lasti + 1; i++) {
5329 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5330 continue;
5331
5332 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5333 alu.op = ALU_OP1_MOV;
5334
5335 alu.src[0].sel = ctx->temp_reg;
5336 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5337 if (i == lasti)
5338 alu.last = 1;
5339 r = r600_bytecode_add_alu(ctx->bc, &alu);
5340 if (r)
5341 return r;
5342 }
5343 return 0;
5344 }
5345
5346 static int tgsi_kill(struct r600_shader_ctx *ctx)
5347 {
5348 const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5349 struct r600_bytecode_alu alu;
5350 int i, r;
5351
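/* Compare src0 = 0.0 against src1: unconditional KILL uses -1.0, so
 * 0 > -1 always holds and the pixel is killed; KILL_IF kills wherever
 * the source component is negative. */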
5352 for (i = 0; i < 4; i++) {
5353 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5354 alu.op = ctx->inst_info->op;
5355
5356 alu.dst.chan = i;
5357
5358 alu.src[0].sel = V_SQ_ALU_SRC_0;
5359
5360 if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
5361 alu.src[1].sel = V_SQ_ALU_SRC_1;
5362 alu.src[1].neg = 1;
5363 } else {
5364 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5365 }
5366 if (i == 3) {
5367 alu.last = 1;
5368 }
5369 r = r600_bytecode_add_alu(ctx->bc, &alu);
5370 if (r)
5371 return r;
5372 }
5373
5374 /* kill must be last in ALU */
5375 ctx->bc->force_add_cf = 1;
5376 ctx->shader->uses_kill = TRUE;
5377 return 0;
5378 }
5379
5380 static int tgsi_lit(struct r600_shader_ctx *ctx)
5381 {
5382 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5383 struct r600_bytecode_alu alu;
5384 int r;
5385
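/* LIT: dst = (1.0, max(src.x, 0), spec, 1.0), where
 * spec = src.x > 0 ? pow(max(src.y, 0), src.w) : 0. The pow is built
 * from LOG_CLAMPED / MUL_LIT / EXP_IEEE below; MUL_LIT appears to
 * handle the src.x <= 0 guard and the exponent clamping. */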
5386 /* tmp.x = max(src.y, 0.0) */
5387 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5388 alu.op = ALU_OP2_MAX;
5389 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
5390 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
5391 alu.src[1].chan = 1;
5392
5393 alu.dst.sel = ctx->temp_reg;
5394 alu.dst.chan = 0;
5395 alu.dst.write = 1;
5396
5397 alu.last = 1;
5398 r = r600_bytecode_add_alu(ctx->bc, &alu);
5399 if (r)
5400 return r;
5401
5402 if (inst->Dst[0].Register.WriteMask & (1 << 2))
5403 {
5404 int chan;
5405 int sel;
5406 unsigned i;
5407
5408 if (ctx->bc->chip_class == CAYMAN) {
5409 for (i = 0; i < 3; i++) {
5410 /* tmp.z = log(tmp.x) */
5411 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5412 alu.op = ALU_OP1_LOG_CLAMPED;
5413 alu.src[0].sel = ctx->temp_reg;
5414 alu.src[0].chan = 0;
5415 alu.dst.sel = ctx->temp_reg;
5416 alu.dst.chan = i;
5417 if (i == 2) {
5418 alu.dst.write = 1;
5419 alu.last = 1;
5420 } else
5421 alu.dst.write = 0;
5422
5423 r = r600_bytecode_add_alu(ctx->bc, &alu);
5424 if (r)
5425 return r;
5426 }
5427 } else {
5428 /* tmp.z = log(tmp.x) */
5429 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5430 alu.op = ALU_OP1_LOG_CLAMPED;
5431 alu.src[0].sel = ctx->temp_reg;
5432 alu.src[0].chan = 0;
5433 alu.dst.sel = ctx->temp_reg;
5434 alu.dst.chan = 2;
5435 alu.dst.write = 1;
5436 alu.last = 1;
5437 r = r600_bytecode_add_alu(ctx->bc, &alu);
5438 if (r)
5439 return r;
5440 }
5441
5442 chan = alu.dst.chan;
5443 sel = alu.dst.sel;
5444
5445 /* tmp.x = MUL_LIT(tmp.z, src.w, src.x) */
5446 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5447 alu.op = ALU_OP3_MUL_LIT;
5448 alu.src[0].sel = sel;
5449 alu.src[0].chan = chan;
5450 r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
5451 r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
5452 alu.dst.sel = ctx->temp_reg;
5453 alu.dst.chan = 0;
5454 alu.dst.write = 1;
5455 alu.is_op3 = 1;
5456 alu.last = 1;
5457 r = r600_bytecode_add_alu(ctx->bc, &alu);
5458 if (r)
5459 return r;
5460
5461 if (ctx->bc->chip_class == CAYMAN) {
5462 for (i = 0; i < 3; i++) {
5463 /* dst.z = exp(tmp.x) */
5464 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5465 alu.op = ALU_OP1_EXP_IEEE;
5466 alu.src[0].sel = ctx->temp_reg;
5467 alu.src[0].chan = 0;
5468 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5469 if (i == 2) {
5470 alu.dst.write = 1;
5471 alu.last = 1;
5472 } else
5473 alu.dst.write = 0;
5474 r = r600_bytecode_add_alu(ctx->bc, &alu);
5475 if (r)
5476 return r;
5477 }
5478 } else {
5479 /* dst.z = exp(tmp.x) */
5480 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5481 alu.op = ALU_OP1_EXP_IEEE;
5482 alu.src[0].sel = ctx->temp_reg;
5483 alu.src[0].chan = 0;
5484 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
5485 alu.last = 1;
5486 r = r600_bytecode_add_alu(ctx->bc, &alu);
5487 if (r)
5488 return r;
5489 }
5490 }
5491
5492 /* dst.x <- 1.0 */
5493 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5494 alu.op = ALU_OP1_MOV;
5495 alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
5496 alu.src[0].chan = 0;
5497 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
5498 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
5499 r = r600_bytecode_add_alu(ctx->bc, &alu);
5500 if (r)
5501 return r;
5502
5503 /* dst.y = max(src.x, 0.0) */
5504 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5505 alu.op = ALU_OP2_MAX;
5506 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5507 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
5508 alu.src[1].chan = 0;
5509 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
5510 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
5511 r = r600_bytecode_add_alu(ctx->bc, &alu);
5512 if (r)
5513 return r;
5514
5515 /* dst.w <- 1.0 */
5516 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5517 alu.op = ALU_OP1_MOV;
5518 alu.src[0].sel = V_SQ_ALU_SRC_1;
5519 alu.src[0].chan = 0;
5520 tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
5521 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
5522 alu.last = 1;
5523 r = r600_bytecode_add_alu(ctx->bc, &alu);
5524 if (r)
5525 return r;
5526
5527 return 0;
5528 }
5529
5530 static int tgsi_rsq(struct r600_shader_ctx *ctx)
5531 {
5532 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5533 struct r600_bytecode_alu alu;
5534 int i, r;
5535
5536 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5537
5538 alu.op = ALU_OP1_RECIPSQRT_IEEE;
5539
5540 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5541 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5542 r600_bytecode_src_set_abs(&alu.src[i]);
5543 }
5544 alu.dst.sel = ctx->temp_reg;
5545 alu.dst.write = 1;
5546 alu.last = 1;
5547 r = r600_bytecode_add_alu(ctx->bc, &alu);
5548 if (r)
5549 return r;
5550 /* replicate result */
5551 return tgsi_helper_tempx_replicate(ctx);
5552 }
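
/* Host-side sketch of the RSQ lowering above: the abs source modifier makes
 * the operand non-negative before RECIPSQRT_IEEE, and the scalar result is
 * then broadcast by tgsi_helper_tempx_replicate(). The helper name is
 * hypothetical. */
#if 0
#include <math.h>

static float ref_rsq(float x)
{
	return 1.0f / sqrtf(fabsf(x));    /* abs modifier + RECIPSQRT_IEEE */
}
#endif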
5553
5554 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
5555 {
5556 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5557 struct r600_bytecode_alu alu;
5558 int i, r;
5559
5560 for (i = 0; i < 4; i++) {
5561 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5562 alu.src[0].sel = ctx->temp_reg;
5563 alu.op = ALU_OP1_MOV;
5564 alu.dst.chan = i;
5565 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5566 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5567 if (i == 3)
5568 alu.last = 1;
5569 r = r600_bytecode_add_alu(ctx->bc, &alu);
5570 if (r)
5571 return r;
5572 }
5573 return 0;
5574 }
5575
5576 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
5577 {
5578 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5579 struct r600_bytecode_alu alu;
5580 int i, r;
5581
5582 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5583 alu.op = ctx->inst_info->op;
5584 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5585 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5586 }
5587 alu.dst.sel = ctx->temp_reg;
5588 alu.dst.write = 1;
5589 alu.last = 1;
5590 r = r600_bytecode_add_alu(ctx->bc, &alu);
5591 if (r)
5592 return r;
5593 /* replicate result */
5594 return tgsi_helper_tempx_replicate(ctx);
5595 }
5596
5597 static int cayman_pow(struct r600_shader_ctx *ctx)
5598 {
5599 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5600 int i, r;
5601 struct r600_bytecode_alu alu;
5602 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5603
5604 for (i = 0; i < 3; i++) {
5605 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5606 alu.op = ALU_OP1_LOG_IEEE;
5607 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5608 alu.dst.sel = ctx->temp_reg;
5609 alu.dst.chan = i;
5610 alu.dst.write = 1;
5611 if (i == 2)
5612 alu.last = 1;
5613 r = r600_bytecode_add_alu(ctx->bc, &alu);
5614 if (r)
5615 return r;
5616 }
5617
5618 /* b * LOG2(a) */
5619 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5620 alu.op = ALU_OP2_MUL;
5621 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5622 alu.src[1].sel = ctx->temp_reg;
5623 alu.dst.sel = ctx->temp_reg;
5624 alu.dst.write = 1;
5625 alu.last = 1;
5626 r = r600_bytecode_add_alu(ctx->bc, &alu);
5627 if (r)
5628 return r;
5629
5630 for (i = 0; i < last_slot; i++) {
5631 		/* POW(a,b) = EXP2(b * LOG2(a)) */
5632 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5633 alu.op = ALU_OP1_EXP_IEEE;
5634 alu.src[0].sel = ctx->temp_reg;
5635
5636 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5637 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5638 if (i == last_slot - 1)
5639 alu.last = 1;
5640 r = r600_bytecode_add_alu(ctx->bc, &alu);
5641 if (r)
5642 return r;
5643 }
5644 return 0;
5645 }
5646
5647 static int tgsi_pow(struct r600_shader_ctx *ctx)
5648 {
5649 struct r600_bytecode_alu alu;
5650 int r;
5651
5652 /* LOG2(a) */
5653 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5654 alu.op = ALU_OP1_LOG_IEEE;
5655 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5656 alu.dst.sel = ctx->temp_reg;
5657 alu.dst.write = 1;
5658 alu.last = 1;
5659 r = r600_bytecode_add_alu(ctx->bc, &alu);
5660 if (r)
5661 return r;
5662 /* b * LOG2(a) */
5663 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5664 alu.op = ALU_OP2_MUL;
5665 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5666 alu.src[1].sel = ctx->temp_reg;
5667 alu.dst.sel = ctx->temp_reg;
5668 alu.dst.write = 1;
5669 alu.last = 1;
5670 r = r600_bytecode_add_alu(ctx->bc, &alu);
5671 if (r)
5672 return r;
5673 	/* POW(a,b) = EXP2(b * LOG2(a)) */
5674 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5675 alu.op = ALU_OP1_EXP_IEEE;
5676 alu.src[0].sel = ctx->temp_reg;
5677 alu.dst.sel = ctx->temp_reg;
5678 alu.dst.write = 1;
5679 alu.last = 1;
5680 r = r600_bytecode_add_alu(ctx->bc, &alu);
5681 if (r)
5682 return r;
5683 return tgsi_helper_tempx_replicate(ctx);
5684 }
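
/* Host-side sketch of the POW lowering shared by cayman_pow() and tgsi_pow()
 * above: POW(a, b) = EXP2(b * LOG2(a)), evaluated on a scalar channel and
 * then replicated to the written components. Like the hardware sequence it
 * yields NaN for a < 0. The helper name is hypothetical. */
#if 0
#include <math.h>

static float ref_pow(float a, float b)
{
	return exp2f(b * log2f(a));    /* LOG_IEEE, MUL, EXP_IEEE */
}
#endif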
5685
5686 static int emit_mul_int_op(struct r600_bytecode *bc,
5687 struct r600_bytecode_alu *alu_src)
5688 {
5689 struct r600_bytecode_alu alu;
5690 int i, r;
5691 alu = *alu_src;
5692 if (bc->chip_class == CAYMAN) {
5693 for (i = 0; i < 4; i++) {
5694 alu.dst.chan = i;
5695 alu.dst.write = (i == alu_src->dst.chan);
5696 alu.last = (i == 3);
5697
5698 r = r600_bytecode_add_alu(bc, &alu);
5699 if (r)
5700 return r;
5701 }
5702 } else {
5703 alu.last = 1;
5704 r = r600_bytecode_add_alu(bc, &alu);
5705 if (r)
5706 return r;
5707 }
5708 return 0;
5709 }
5710
5711 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
5712 {
5713 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5714 struct r600_bytecode_alu alu;
5715 int i, r, j;
5716 unsigned write_mask = inst->Dst[0].Register.WriteMask;
5717 int tmp0 = ctx->temp_reg;
5718 int tmp1 = r600_get_temp(ctx);
5719 int tmp2 = r600_get_temp(ctx);
5720 int tmp3 = r600_get_temp(ctx);
5721 	/* Unsigned path:
5722 	 *
5723 	 * we need to represent src1 as src2*q + r, where q is the quotient and r the
5724 	 * remainder; src1/src2 here mean ctx->src[0]/ctx->src[1]. A host-side sketch of this sequence follows tgsi_divmod() below.
5725 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
5726 * 2. tmp0.z = lo (tmp0.x * src2)
5727 * 3. tmp0.w = -tmp0.z
5728 * 4. tmp0.y = hi (tmp0.x * src2)
5729 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
5730 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
5731 * 7. tmp1.x = tmp0.x - tmp0.w
5732 * 8. tmp1.y = tmp0.x + tmp0.w
5733 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
5734 * 10. tmp0.z = hi(tmp0.x * src1) = q
5735 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
5736 *
5737 * 12. tmp0.w = src1 - tmp0.y = r
5738 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
5739 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
5740 *
5741 * if DIV
5742 *
5743 * 15. tmp1.z = tmp0.z + 1 = q + 1
5744 * 16. tmp1.w = tmp0.z - 1 = q - 1
5745 *
5746 * else MOD
5747 *
5748 * 15. tmp1.z = tmp0.w - src2 = r - src2
5749 * 16. tmp1.w = tmp0.w + src2 = r + src2
5750 *
5751 * endif
5752 *
5753 * 17. tmp1.x = tmp1.x & tmp1.y
5754 *
5755 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
5756 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
5757 *
5758 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
5759 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
5760 *
5761 * Signed path:
5762 *
5763 * Same as unsigned, using abs values of the operands,
5764 * and fixing the sign of the result in the end.
5765 */
5766
5767 for (i = 0; i < 4; i++) {
5768 if (!(write_mask & (1<<i)))
5769 continue;
5770
5771 if (signed_op) {
5772
5773 /* tmp2.x = -src0 */
5774 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5775 alu.op = ALU_OP2_SUB_INT;
5776
5777 alu.dst.sel = tmp2;
5778 alu.dst.chan = 0;
5779 alu.dst.write = 1;
5780
5781 alu.src[0].sel = V_SQ_ALU_SRC_0;
5782
5783 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5784
5785 alu.last = 1;
5786 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5787 return r;
5788
5789 /* tmp2.y = -src1 */
5790 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5791 alu.op = ALU_OP2_SUB_INT;
5792
5793 alu.dst.sel = tmp2;
5794 alu.dst.chan = 1;
5795 alu.dst.write = 1;
5796
5797 alu.src[0].sel = V_SQ_ALU_SRC_0;
5798
5799 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5800
5801 alu.last = 1;
5802 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5803 return r;
5804
5805 			/* tmp2.z sign bit is set if src0 and src1 signs are different */
5806 			/* it will be the sign of the quotient */
5807 if (!mod) {
5808
5809 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5810 alu.op = ALU_OP2_XOR_INT;
5811
5812 alu.dst.sel = tmp2;
5813 alu.dst.chan = 2;
5814 alu.dst.write = 1;
5815
5816 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5817 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5818
5819 alu.last = 1;
5820 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5821 return r;
5822 }
5823
5824 /* tmp2.x = |src0| */
5825 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5826 alu.op = ALU_OP3_CNDGE_INT;
5827 alu.is_op3 = 1;
5828
5829 alu.dst.sel = tmp2;
5830 alu.dst.chan = 0;
5831 alu.dst.write = 1;
5832
5833 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5834 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5835 alu.src[2].sel = tmp2;
5836 alu.src[2].chan = 0;
5837
5838 alu.last = 1;
5839 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5840 return r;
5841
5842 /* tmp2.y = |src1| */
5843 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5844 alu.op = ALU_OP3_CNDGE_INT;
5845 alu.is_op3 = 1;
5846
5847 alu.dst.sel = tmp2;
5848 alu.dst.chan = 1;
5849 alu.dst.write = 1;
5850
5851 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5852 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5853 alu.src[2].sel = tmp2;
5854 alu.src[2].chan = 1;
5855
5856 alu.last = 1;
5857 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5858 return r;
5859
5860 }
5861
5862 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
5863 if (ctx->bc->chip_class == CAYMAN) {
5864 /* tmp3.x = u2f(src2) */
5865 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5866 alu.op = ALU_OP1_UINT_TO_FLT;
5867
5868 alu.dst.sel = tmp3;
5869 alu.dst.chan = 0;
5870 alu.dst.write = 1;
5871
5872 if (signed_op) {
5873 alu.src[0].sel = tmp2;
5874 alu.src[0].chan = 1;
5875 } else {
5876 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5877 }
5878
5879 alu.last = 1;
5880 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5881 return r;
5882
5883 /* tmp0.x = recip(tmp3.x) */
5884 for (j = 0 ; j < 3; j++) {
5885 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5886 alu.op = ALU_OP1_RECIP_IEEE;
5887
5888 alu.dst.sel = tmp0;
5889 alu.dst.chan = j;
5890 alu.dst.write = (j == 0);
5891
5892 alu.src[0].sel = tmp3;
5893 alu.src[0].chan = 0;
5894
5895 if (j == 2)
5896 alu.last = 1;
5897 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5898 return r;
5899 }
5900
5901 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5902 alu.op = ALU_OP2_MUL;
5903
5904 alu.src[0].sel = tmp0;
5905 alu.src[0].chan = 0;
5906
5907 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5908 alu.src[1].value = 0x4f800000;
5909
5910 alu.dst.sel = tmp3;
5911 alu.dst.write = 1;
5912 alu.last = 1;
5913 r = r600_bytecode_add_alu(ctx->bc, &alu);
5914 if (r)
5915 return r;
5916
5917 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5918 alu.op = ALU_OP1_FLT_TO_UINT;
5919
5920 alu.dst.sel = tmp0;
5921 alu.dst.chan = 0;
5922 alu.dst.write = 1;
5923
5924 alu.src[0].sel = tmp3;
5925 alu.src[0].chan = 0;
5926
5927 alu.last = 1;
5928 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5929 return r;
5930
5931 } else {
5932 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5933 alu.op = ALU_OP1_RECIP_UINT;
5934
5935 alu.dst.sel = tmp0;
5936 alu.dst.chan = 0;
5937 alu.dst.write = 1;
5938
5939 if (signed_op) {
5940 alu.src[0].sel = tmp2;
5941 alu.src[0].chan = 1;
5942 } else {
5943 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5944 }
5945
5946 alu.last = 1;
5947 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5948 return r;
5949 }
5950
5951 /* 2. tmp0.z = lo (tmp0.x * src2) */
5952 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5953 alu.op = ALU_OP2_MULLO_UINT;
5954
5955 alu.dst.sel = tmp0;
5956 alu.dst.chan = 2;
5957 alu.dst.write = 1;
5958
5959 alu.src[0].sel = tmp0;
5960 alu.src[0].chan = 0;
5961 if (signed_op) {
5962 alu.src[1].sel = tmp2;
5963 alu.src[1].chan = 1;
5964 } else {
5965 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5966 }
5967
5968 if ((r = emit_mul_int_op(ctx->bc, &alu)))
5969 return r;
5970
5971 /* 3. tmp0.w = -tmp0.z */
5972 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5973 alu.op = ALU_OP2_SUB_INT;
5974
5975 alu.dst.sel = tmp0;
5976 alu.dst.chan = 3;
5977 alu.dst.write = 1;
5978
5979 alu.src[0].sel = V_SQ_ALU_SRC_0;
5980 alu.src[1].sel = tmp0;
5981 alu.src[1].chan = 2;
5982
5983 alu.last = 1;
5984 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5985 return r;
5986
5987 /* 4. tmp0.y = hi (tmp0.x * src2) */
5988 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5989 alu.op = ALU_OP2_MULHI_UINT;
5990
5991 alu.dst.sel = tmp0;
5992 alu.dst.chan = 1;
5993 alu.dst.write = 1;
5994
5995 alu.src[0].sel = tmp0;
5996 alu.src[0].chan = 0;
5997
5998 if (signed_op) {
5999 alu.src[1].sel = tmp2;
6000 alu.src[1].chan = 1;
6001 } else {
6002 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6003 }
6004
6005 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6006 return r;
6007
6008 		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2)) */
6009 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6010 alu.op = ALU_OP3_CNDE_INT;
6011 alu.is_op3 = 1;
6012
6013 alu.dst.sel = tmp0;
6014 alu.dst.chan = 2;
6015 alu.dst.write = 1;
6016
6017 alu.src[0].sel = tmp0;
6018 alu.src[0].chan = 1;
6019 alu.src[1].sel = tmp0;
6020 alu.src[1].chan = 3;
6021 alu.src[2].sel = tmp0;
6022 alu.src[2].chan = 2;
6023
6024 alu.last = 1;
6025 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6026 return r;
6027
6028 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
6029 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6030 alu.op = ALU_OP2_MULHI_UINT;
6031
6032 alu.dst.sel = tmp0;
6033 alu.dst.chan = 3;
6034 alu.dst.write = 1;
6035
6036 alu.src[0].sel = tmp0;
6037 alu.src[0].chan = 2;
6038
6039 alu.src[1].sel = tmp0;
6040 alu.src[1].chan = 0;
6041
6042 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6043 return r;
6044
6045 /* 7. tmp1.x = tmp0.x - tmp0.w */
6046 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6047 alu.op = ALU_OP2_SUB_INT;
6048
6049 alu.dst.sel = tmp1;
6050 alu.dst.chan = 0;
6051 alu.dst.write = 1;
6052
6053 alu.src[0].sel = tmp0;
6054 alu.src[0].chan = 0;
6055 alu.src[1].sel = tmp0;
6056 alu.src[1].chan = 3;
6057
6058 alu.last = 1;
6059 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6060 return r;
6061
6062 /* 8. tmp1.y = tmp0.x + tmp0.w */
6063 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6064 alu.op = ALU_OP2_ADD_INT;
6065
6066 alu.dst.sel = tmp1;
6067 alu.dst.chan = 1;
6068 alu.dst.write = 1;
6069
6070 alu.src[0].sel = tmp0;
6071 alu.src[0].chan = 0;
6072 alu.src[1].sel = tmp0;
6073 alu.src[1].chan = 3;
6074
6075 alu.last = 1;
6076 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6077 return r;
6078
6079 /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
6080 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6081 alu.op = ALU_OP3_CNDE_INT;
6082 alu.is_op3 = 1;
6083
6084 alu.dst.sel = tmp0;
6085 alu.dst.chan = 0;
6086 alu.dst.write = 1;
6087
6088 alu.src[0].sel = tmp0;
6089 alu.src[0].chan = 1;
6090 alu.src[1].sel = tmp1;
6091 alu.src[1].chan = 1;
6092 alu.src[2].sel = tmp1;
6093 alu.src[2].chan = 0;
6094
6095 alu.last = 1;
6096 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6097 return r;
6098
6099 /* 10. tmp0.z = hi(tmp0.x * src1) = q */
6100 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6101 alu.op = ALU_OP2_MULHI_UINT;
6102
6103 alu.dst.sel = tmp0;
6104 alu.dst.chan = 2;
6105 alu.dst.write = 1;
6106
6107 alu.src[0].sel = tmp0;
6108 alu.src[0].chan = 0;
6109
6110 if (signed_op) {
6111 alu.src[1].sel = tmp2;
6112 alu.src[1].chan = 0;
6113 } else {
6114 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6115 }
6116
6117 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6118 return r;
6119
6120 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
6121 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6122 alu.op = ALU_OP2_MULLO_UINT;
6123
6124 alu.dst.sel = tmp0;
6125 alu.dst.chan = 1;
6126 alu.dst.write = 1;
6127
6128 if (signed_op) {
6129 alu.src[0].sel = tmp2;
6130 alu.src[0].chan = 1;
6131 } else {
6132 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6133 }
6134
6135 alu.src[1].sel = tmp0;
6136 alu.src[1].chan = 2;
6137
6138 if ((r = emit_mul_int_op(ctx->bc, &alu)))
6139 return r;
6140
6141 /* 12. tmp0.w = src1 - tmp0.y = r */
6142 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6143 alu.op = ALU_OP2_SUB_INT;
6144
6145 alu.dst.sel = tmp0;
6146 alu.dst.chan = 3;
6147 alu.dst.write = 1;
6148
6149 if (signed_op) {
6150 alu.src[0].sel = tmp2;
6151 alu.src[0].chan = 0;
6152 } else {
6153 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6154 }
6155
6156 alu.src[1].sel = tmp0;
6157 alu.src[1].chan = 1;
6158
6159 alu.last = 1;
6160 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6161 return r;
6162
6163 /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
6164 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6165 alu.op = ALU_OP2_SETGE_UINT;
6166
6167 alu.dst.sel = tmp1;
6168 alu.dst.chan = 0;
6169 alu.dst.write = 1;
6170
6171 alu.src[0].sel = tmp0;
6172 alu.src[0].chan = 3;
6173 if (signed_op) {
6174 alu.src[1].sel = tmp2;
6175 alu.src[1].chan = 1;
6176 } else {
6177 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6178 }
6179
6180 alu.last = 1;
6181 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6182 return r;
6183
6184 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
6185 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6186 alu.op = ALU_OP2_SETGE_UINT;
6187
6188 alu.dst.sel = tmp1;
6189 alu.dst.chan = 1;
6190 alu.dst.write = 1;
6191
6192 if (signed_op) {
6193 alu.src[0].sel = tmp2;
6194 alu.src[0].chan = 0;
6195 } else {
6196 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6197 }
6198
6199 alu.src[1].sel = tmp0;
6200 alu.src[1].chan = 1;
6201
6202 alu.last = 1;
6203 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6204 return r;
6205
6206 if (mod) { /* UMOD */
6207
6208 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */
6209 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6210 alu.op = ALU_OP2_SUB_INT;
6211
6212 alu.dst.sel = tmp1;
6213 alu.dst.chan = 2;
6214 alu.dst.write = 1;
6215
6216 alu.src[0].sel = tmp0;
6217 alu.src[0].chan = 3;
6218
6219 if (signed_op) {
6220 alu.src[1].sel = tmp2;
6221 alu.src[1].chan = 1;
6222 } else {
6223 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6224 }
6225
6226 alu.last = 1;
6227 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6228 return r;
6229
6230 /* 16. tmp1.w = tmp0.w + src2 = r + src2 */
6231 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6232 alu.op = ALU_OP2_ADD_INT;
6233
6234 alu.dst.sel = tmp1;
6235 alu.dst.chan = 3;
6236 alu.dst.write = 1;
6237
6238 alu.src[0].sel = tmp0;
6239 alu.src[0].chan = 3;
6240 if (signed_op) {
6241 alu.src[1].sel = tmp2;
6242 alu.src[1].chan = 1;
6243 } else {
6244 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6245 }
6246
6247 alu.last = 1;
6248 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6249 return r;
6250
6251 } else { /* UDIV */
6252
6253 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
6254 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6255 alu.op = ALU_OP2_ADD_INT;
6256
6257 alu.dst.sel = tmp1;
6258 alu.dst.chan = 2;
6259 alu.dst.write = 1;
6260
6261 alu.src[0].sel = tmp0;
6262 alu.src[0].chan = 2;
6263 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6264
6265 alu.last = 1;
6266 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6267 return r;
6268
6269 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */
6270 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6271 alu.op = ALU_OP2_ADD_INT;
6272
6273 alu.dst.sel = tmp1;
6274 alu.dst.chan = 3;
6275 alu.dst.write = 1;
6276
6277 alu.src[0].sel = tmp0;
6278 alu.src[0].chan = 2;
6279 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
6280
6281 alu.last = 1;
6282 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6283 return r;
6284
6285 }
6286
6287 /* 17. tmp1.x = tmp1.x & tmp1.y */
6288 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6289 alu.op = ALU_OP2_AND_INT;
6290
6291 alu.dst.sel = tmp1;
6292 alu.dst.chan = 0;
6293 alu.dst.write = 1;
6294
6295 alu.src[0].sel = tmp1;
6296 alu.src[0].chan = 0;
6297 alu.src[1].sel = tmp1;
6298 alu.src[1].chan = 1;
6299
6300 alu.last = 1;
6301 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6302 return r;
6303
6304 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
6305 /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
6306 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6307 alu.op = ALU_OP3_CNDE_INT;
6308 alu.is_op3 = 1;
6309
6310 alu.dst.sel = tmp0;
6311 alu.dst.chan = 2;
6312 alu.dst.write = 1;
6313
6314 alu.src[0].sel = tmp1;
6315 alu.src[0].chan = 0;
6316 alu.src[1].sel = tmp0;
6317 alu.src[1].chan = mod ? 3 : 2;
6318 alu.src[2].sel = tmp1;
6319 alu.src[2].chan = 2;
6320
6321 alu.last = 1;
6322 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6323 return r;
6324
6325 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
6326 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6327 alu.op = ALU_OP3_CNDE_INT;
6328 alu.is_op3 = 1;
6329
6330 if (signed_op) {
6331 alu.dst.sel = tmp0;
6332 alu.dst.chan = 2;
6333 alu.dst.write = 1;
6334 } else {
6335 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6336 }
6337
6338 alu.src[0].sel = tmp1;
6339 alu.src[0].chan = 1;
6340 alu.src[1].sel = tmp1;
6341 alu.src[1].chan = 3;
6342 alu.src[2].sel = tmp0;
6343 alu.src[2].chan = 2;
6344
6345 alu.last = 1;
6346 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6347 return r;
6348
6349 if (signed_op) {
6350
6351 /* fix the sign of the result */
6352
6353 if (mod) {
6354
6355 /* tmp0.x = -tmp0.z */
6356 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6357 alu.op = ALU_OP2_SUB_INT;
6358
6359 alu.dst.sel = tmp0;
6360 alu.dst.chan = 0;
6361 alu.dst.write = 1;
6362
6363 alu.src[0].sel = V_SQ_ALU_SRC_0;
6364 alu.src[1].sel = tmp0;
6365 alu.src[1].chan = 2;
6366
6367 alu.last = 1;
6368 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6369 return r;
6370
6371 /* sign of the remainder is the same as the sign of src0 */
6372 /* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
6373 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6374 alu.op = ALU_OP3_CNDGE_INT;
6375 alu.is_op3 = 1;
6376
6377 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6378
6379 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6380 alu.src[1].sel = tmp0;
6381 alu.src[1].chan = 2;
6382 alu.src[2].sel = tmp0;
6383 alu.src[2].chan = 0;
6384
6385 alu.last = 1;
6386 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6387 return r;
6388
6389 } else {
6390
6391 /* tmp0.x = -tmp0.z */
6392 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6393 alu.op = ALU_OP2_SUB_INT;
6394
6395 alu.dst.sel = tmp0;
6396 alu.dst.chan = 0;
6397 alu.dst.write = 1;
6398
6399 alu.src[0].sel = V_SQ_ALU_SRC_0;
6400 alu.src[1].sel = tmp0;
6401 alu.src[1].chan = 2;
6402
6403 alu.last = 1;
6404 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6405 return r;
6406
6407 /* fix the quotient sign (same as the sign of src0*src1) */
6408 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
6409 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6410 alu.op = ALU_OP3_CNDGE_INT;
6411 alu.is_op3 = 1;
6412
6413 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6414
6415 alu.src[0].sel = tmp2;
6416 alu.src[0].chan = 2;
6417 alu.src[1].sel = tmp0;
6418 alu.src[1].chan = 2;
6419 alu.src[2].sel = tmp0;
6420 alu.src[2].chan = 0;
6421
6422 alu.last = 1;
6423 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6424 return r;
6425 }
6426 }
6427 }
6428 return 0;
6429 }
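
/* Host-side reference model (a minimal sketch, not driver code) of the
 * 20-step unsigned sequence commented at the top of tgsi_divmod(). With the
 * exact floor reciprocal used here the refinement error e collapses to zero
 * and q can only be an underestimate; the hardware RECIP_UINT is only
 * approximate in both directions, which is what steps 5-9 and the q-1
 * correction compensate for. The helper name is hypothetical. */
#if 0
#include <stdint.h>

static uint32_t ref_udivmod(uint32_t num, uint32_t den, int mod)
{
	if (den == 0)
		return 0xffffffffu;                      /* 20. src2==0 -> MAX_UINT */

	/* 1. rcp ~ 2^32/den, rounded so rcp*den never reaches 2^32 */
	uint32_t rcp = (uint32_t)(0xffffffffull / den);
	uint32_t lo  = rcp * den;                                /* 2. lo(rcp*den) */
	uint32_t neg = 0u - lo;                                  /* 3. -lo         */
	uint32_t hi  = (uint32_t)(((uint64_t)rcp * den) >> 32);  /* 4. hi(rcp*den) */
	uint32_t ab  = hi == 0 ? neg : lo;                       /* 5. |lo(rcp*den)| */
	uint32_t e   = (uint32_t)(((uint64_t)ab * rcp) >> 32);   /* 6. error       */
	rcp = hi == 0 ? rcp + e : rcp - e;                       /* 7-9. refine    */

	uint32_t q  = (uint32_t)(((uint64_t)rcp * num) >> 32);   /* 10. quotient   */
	uint32_t qd = q * den;                                   /* 11. den*q      */
	uint32_t r  = num - qd;                                  /* 12. remainder  */

	int r_ge_den = r >= den;                                 /* 13.            */
	int r_ge_0   = num >= qd;                                /* 14. no wrap    */

	/* 15-19: the estimate is off by at most one in either direction */
	if (!r_ge_0)
		return mod ? r + den : q - 1;
	if (r_ge_den)
		return mod ? r - den : q + 1;
	return mod ? r : q;
}
#endif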
6430
6431 static int tgsi_udiv(struct r600_shader_ctx *ctx)
6432 {
6433 return tgsi_divmod(ctx, 0, 0);
6434 }
6435
6436 static int tgsi_umod(struct r600_shader_ctx *ctx)
6437 {
6438 return tgsi_divmod(ctx, 1, 0);
6439 }
6440
6441 static int tgsi_idiv(struct r600_shader_ctx *ctx)
6442 {
6443 return tgsi_divmod(ctx, 0, 1);
6444 }
6445
6446 static int tgsi_imod(struct r600_shader_ctx *ctx)
6447 {
6448 return tgsi_divmod(ctx, 1, 1);
6449 }
6450
6451
6452 static int tgsi_f2i(struct r600_shader_ctx *ctx)
6453 {
6454 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6455 struct r600_bytecode_alu alu;
6456 int i, r;
6457 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6458 int last_inst = tgsi_last_instruction(write_mask);
6459
6460 for (i = 0; i < 4; i++) {
6461 if (!(write_mask & (1<<i)))
6462 continue;
6463
6464 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6465 alu.op = ALU_OP1_TRUNC;
6466
6467 alu.dst.sel = ctx->temp_reg;
6468 alu.dst.chan = i;
6469 alu.dst.write = 1;
6470
6471 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6472 if (i == last_inst)
6473 alu.last = 1;
6474 r = r600_bytecode_add_alu(ctx->bc, &alu);
6475 if (r)
6476 return r;
6477 }
6478
6479 for (i = 0; i < 4; i++) {
6480 if (!(write_mask & (1<<i)))
6481 continue;
6482
6483 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6484 alu.op = ctx->inst_info->op;
6485
6486 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6487
6488 alu.src[0].sel = ctx->temp_reg;
6489 alu.src[0].chan = i;
6490
6491 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
6492 alu.last = 1;
6493 r = r600_bytecode_add_alu(ctx->bc, &alu);
6494 if (r)
6495 return r;
6496 }
6497
6498 return 0;
6499 }
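
/* Host-side sketch of the two-pass conversion above: TRUNC first rounds
 * toward zero in floating point, then FLT_TO_INT (or FLT_TO_UINT for the
 * unsigned opcodes) converts the result; out-of-range inputs are left to the
 * hardware's clamping behaviour. The helper name is hypothetical. */
#if 0
#include <math.h>

static int ref_f2i(float x)
{
	return (int)truncf(x);    /* ALU_OP1_TRUNC, then ALU_OP1_FLT_TO_INT */
}
#endif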
6500
6501 static int tgsi_iabs(struct r600_shader_ctx *ctx)
6502 {
6503 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6504 struct r600_bytecode_alu alu;
6505 int i, r;
6506 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6507 int last_inst = tgsi_last_instruction(write_mask);
6508
6509 /* tmp = -src */
6510 for (i = 0; i < 4; i++) {
6511 if (!(write_mask & (1<<i)))
6512 continue;
6513
6514 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6515 alu.op = ALU_OP2_SUB_INT;
6516
6517 alu.dst.sel = ctx->temp_reg;
6518 alu.dst.chan = i;
6519 alu.dst.write = 1;
6520
6521 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6522 alu.src[0].sel = V_SQ_ALU_SRC_0;
6523
6524 if (i == last_inst)
6525 alu.last = 1;
6526 r = r600_bytecode_add_alu(ctx->bc, &alu);
6527 if (r)
6528 return r;
6529 }
6530
6531 /* dst = (src >= 0 ? src : tmp) */
6532 for (i = 0; i < 4; i++) {
6533 if (!(write_mask & (1<<i)))
6534 continue;
6535
6536 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6537 alu.op = ALU_OP3_CNDGE_INT;
6538 alu.is_op3 = 1;
6539 alu.dst.write = 1;
6540
6541 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6542
6543 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6544 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6545 alu.src[2].sel = ctx->temp_reg;
6546 alu.src[2].chan = i;
6547
6548 if (i == last_inst)
6549 alu.last = 1;
6550 r = r600_bytecode_add_alu(ctx->bc, &alu);
6551 if (r)
6552 return r;
6553 }
6554 return 0;
6555 }
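
/* Host-side sketch of the IABS lowering above: negate into a temp with
 * SUB_INT, then CNDGE_INT keeps the original value when it is already
 * non-negative. The helper name is hypothetical. */
#if 0
#include <stdint.h>

static int32_t ref_iabs(int32_t x)
{
	int32_t t = (int32_t)(0u - (uint32_t)x);  /* tmp = -src; wraps like SUB_INT */
	return x >= 0 ? x : t;                    /* ALU_OP3_CNDGE_INT */
}
#endif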
6556
6557 static int tgsi_issg(struct r600_shader_ctx *ctx)
6558 {
6559 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6560 struct r600_bytecode_alu alu;
6561 int i, r;
6562 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6563 int last_inst = tgsi_last_instruction(write_mask);
6564
6565 /* tmp = (src >= 0 ? src : -1) */
6566 for (i = 0; i < 4; i++) {
6567 if (!(write_mask & (1<<i)))
6568 continue;
6569
6570 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6571 alu.op = ALU_OP3_CNDGE_INT;
6572 alu.is_op3 = 1;
6573
6574 alu.dst.sel = ctx->temp_reg;
6575 alu.dst.chan = i;
6576 alu.dst.write = 1;
6577
6578 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6579 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6580 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
6581
6582 if (i == last_inst)
6583 alu.last = 1;
6584 r = r600_bytecode_add_alu(ctx->bc, &alu);
6585 if (r)
6586 return r;
6587 }
6588
6589 /* dst = (tmp > 0 ? 1 : tmp) */
6590 for (i = 0; i < 4; i++) {
6591 if (!(write_mask & (1<<i)))
6592 continue;
6593
6594 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6595 alu.op = ALU_OP3_CNDGT_INT;
6596 alu.is_op3 = 1;
6597 alu.dst.write = 1;
6598
6599 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6600
6601 alu.src[0].sel = ctx->temp_reg;
6602 alu.src[0].chan = i;
6603
6604 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6605
6606 alu.src[2].sel = ctx->temp_reg;
6607 alu.src[2].chan = i;
6608
6609 if (i == last_inst)
6610 alu.last = 1;
6611 r = r600_bytecode_add_alu(ctx->bc, &alu);
6612 if (r)
6613 return r;
6614 }
6615 return 0;
6616 }
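
/* Host-side sketch of the ISSG lowering above, composed from the same two
 * conditional moves. The helper name is hypothetical. */
#if 0
#include <stdint.h>

static int32_t ref_issg(int32_t x)
{
	int32_t t = x >= 0 ? x : -1;   /* ALU_OP3_CNDGE_INT */
	return t > 0 ? 1 : t;          /* ALU_OP3_CNDGT_INT */
}
#endif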
6617
6618
6619
6620 static int tgsi_ssg(struct r600_shader_ctx *ctx)
6621 {
6622 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6623 struct r600_bytecode_alu alu;
6624 int i, r;
6625
6626 /* tmp = (src > 0 ? 1 : src) */
6627 for (i = 0; i < 4; i++) {
6628 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6629 alu.op = ALU_OP3_CNDGT;
6630 alu.is_op3 = 1;
6631
6632 alu.dst.sel = ctx->temp_reg;
6633 alu.dst.chan = i;
6634
6635 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6636 alu.src[1].sel = V_SQ_ALU_SRC_1;
6637 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6638
6639 if (i == 3)
6640 alu.last = 1;
6641 r = r600_bytecode_add_alu(ctx->bc, &alu);
6642 if (r)
6643 return r;
6644 }
6645
6646 /* dst = (-tmp > 0 ? -1 : tmp) */
6647 for (i = 0; i < 4; i++) {
6648 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6649 alu.op = ALU_OP3_CNDGT;
6650 alu.is_op3 = 1;
6651 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6652
6653 alu.src[0].sel = ctx->temp_reg;
6654 alu.src[0].chan = i;
6655 alu.src[0].neg = 1;
6656
6657 alu.src[1].sel = V_SQ_ALU_SRC_1;
6658 alu.src[1].neg = 1;
6659
6660 alu.src[2].sel = ctx->temp_reg;
6661 alu.src[2].chan = i;
6662
6663 if (i == 3)
6664 alu.last = 1;
6665 r = r600_bytecode_add_alu(ctx->bc, &alu);
6666 if (r)
6667 return r;
6668 }
6669 return 0;
6670 }
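
/* Host-side sketch of the SSG lowering above; unlike the integer variant it
 * is built from two float CNDGTs, the second operating on negated sources.
 * The helper name is hypothetical. */
#if 0
static float ref_ssg(float x)
{
	float t = x > 0.0f ? 1.0f : x;    /* tmp = (src > 0 ? 1 : src) */
	return -t > 0.0f ? -1.0f : t;     /* dst = (-tmp > 0 ? -1 : tmp) */
}
#endif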
6671
6672 static int tgsi_bfi(struct r600_shader_ctx *ctx)
6673 {
6674 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6675 struct r600_bytecode_alu alu;
6676 int i, r, t1, t2;
6677
6678 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6679 int last_inst = tgsi_last_instruction(write_mask);
6680
6681 t1 = r600_get_temp(ctx);
6682
6683 for (i = 0; i < 4; i++) {
6684 if (!(write_mask & (1<<i)))
6685 continue;
6686
6687 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6688 alu.op = ALU_OP2_SETGE_INT;
6689 r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
6690 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6691 alu.src[1].value = 32;
6692 alu.dst.sel = ctx->temp_reg;
6693 alu.dst.chan = i;
6694 alu.dst.write = 1;
6695 alu.last = i == last_inst;
6696 r = r600_bytecode_add_alu(ctx->bc, &alu);
6697 if (r)
6698 return r;
6699 }
6700
6701 for (i = 0; i < 4; i++) {
6702 if (!(write_mask & (1<<i)))
6703 continue;
6704
6705 		/* create the field mask in t1 */
6706 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6707 alu.op = ALU_OP2_BFM_INT;
6708 alu.dst.sel = t1;
6709 alu.dst.chan = i;
6710 alu.dst.write = 1;
6711 alu.last = i == last_inst;
6712
6713 r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
6714 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6715
6716 r = r600_bytecode_add_alu(ctx->bc, &alu);
6717 if (r)
6718 return r;
6719 }
6720
6721 t2 = r600_get_temp(ctx);
6722
6723 for (i = 0; i < 4; i++) {
6724 if (!(write_mask & (1<<i)))
6725 continue;
6726
6727 /* shift insert left */
6728 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6729 alu.op = ALU_OP2_LSHL_INT;
6730 alu.dst.sel = t2;
6731 alu.dst.chan = i;
6732 alu.dst.write = 1;
6733 alu.last = i == last_inst;
6734
6735 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6736 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6737
6738 r = r600_bytecode_add_alu(ctx->bc, &alu);
6739 if (r)
6740 return r;
6741 }
6742
6743 for (i = 0; i < 4; i++) {
6744 if (!(write_mask & (1<<i)))
6745 continue;
6746
6747 /* actual bitfield insert */
6748 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6749 alu.op = ALU_OP3_BFI_INT;
6750 alu.is_op3 = 1;
6751 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6752 alu.dst.chan = i;
6753 alu.dst.write = 1;
6754 alu.last = i == last_inst;
6755
6756 alu.src[0].sel = t1;
6757 alu.src[0].chan = i;
6758 alu.src[1].sel = t2;
6759 alu.src[1].chan = i;
6760 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6761
6762 r = r600_bytecode_add_alu(ctx->bc, &alu);
6763 if (r)
6764 return r;
6765 }
6766
6767 for (i = 0; i < 4; i++) {
6768 if (!(write_mask & (1<<i)))
6769 continue;
6770 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6771 alu.op = ALU_OP3_CNDE_INT;
6772 alu.is_op3 = 1;
6773 alu.src[0].sel = ctx->temp_reg;
6774 alu.src[0].chan = i;
6775 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
6776
6777 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6778
6779 alu.src[1].sel = alu.dst.sel;
6780 alu.src[1].chan = i;
6781
6782 alu.last = i == last_inst;
6783 r = r600_bytecode_add_alu(ctx->bc, &alu);
6784 if (r)
6785 return r;
6786 }
6787 return 0;
6788 }
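
/* Host-side reference model (sketch) of the BFI sequence above: BFM builds
 * the field mask, LSHL positions the insert value, BFI merges, and the final
 * CNDE_INT returns the insert operand unmodified when width >= 32 (where
 * BFM's shift would be undefined). Behaviour for offset + width > 32 is left
 * undefined, as in GLSL bitfieldInsert(). The helper name is hypothetical. */
#if 0
#include <stdint.h>

static uint32_t ref_bfi(uint32_t base, uint32_t insert,
                        uint32_t offset, uint32_t width)
{
	if ((int32_t)width >= 32)                          /* SETGE_INT + CNDE_INT */
		return insert;
	uint32_t mask = ((1u << width) - 1) << offset;     /* ALU_OP2_BFM_INT  */
	uint32_t ins  = insert << offset;                  /* ALU_OP2_LSHL_INT */
	return (mask & ins) | (~mask & base);              /* ALU_OP3_BFI_INT  */
}
#endif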
6789
6790 static int tgsi_msb(struct r600_shader_ctx *ctx)
6791 {
6792 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6793 struct r600_bytecode_alu alu;
6794 int i, r, t1, t2;
6795
6796 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6797 int last_inst = tgsi_last_instruction(write_mask);
6798
6799 assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
6800 ctx->inst_info->op == ALU_OP1_FFBH_UINT);
6801
6802 t1 = ctx->temp_reg;
6803
6804 /* bit position is indexed from lsb by TGSI, and from msb by the hardware */
6805 for (i = 0; i < 4; i++) {
6806 if (!(write_mask & (1<<i)))
6807 continue;
6808
6809 /* t1 = FFBH_INT / FFBH_UINT */
6810 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6811 alu.op = ctx->inst_info->op;
6812 alu.dst.sel = t1;
6813 alu.dst.chan = i;
6814 alu.dst.write = 1;
6815 alu.last = i == last_inst;
6816
6817 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6818
6819 r = r600_bytecode_add_alu(ctx->bc, &alu);
6820 if (r)
6821 return r;
6822 }
6823
6824 t2 = r600_get_temp(ctx);
6825
6826 for (i = 0; i < 4; i++) {
6827 if (!(write_mask & (1<<i)))
6828 continue;
6829
6830 /* t2 = 31 - t1 */
6831 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6832 alu.op = ALU_OP2_SUB_INT;
6833 alu.dst.sel = t2;
6834 alu.dst.chan = i;
6835 alu.dst.write = 1;
6836 alu.last = i == last_inst;
6837
6838 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
6839 alu.src[0].value = 31;
6840 alu.src[1].sel = t1;
6841 alu.src[1].chan = i;
6842
6843 r = r600_bytecode_add_alu(ctx->bc, &alu);
6844 if (r)
6845 return r;
6846 }
6847
6848 for (i = 0; i < 4; i++) {
6849 if (!(write_mask & (1<<i)))
6850 continue;
6851
6852 /* result = t1 >= 0 ? t2 : t1 */
6853 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6854 alu.op = ALU_OP3_CNDGE_INT;
6855 alu.is_op3 = 1;
6856 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6857 alu.dst.chan = i;
6858 alu.dst.write = 1;
6859 alu.last = i == last_inst;
6860
6861 alu.src[0].sel = t1;
6862 alu.src[0].chan = i;
6863 alu.src[1].sel = t2;
6864 alu.src[1].chan = i;
6865 alu.src[2].sel = t1;
6866 alu.src[2].chan = i;
6867
6868 r = r600_bytecode_add_alu(ctx->bc, &alu);
6869 if (r)
6870 return r;
6871 }
6872
6873 return 0;
6874 }
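
/* Host-side sketch of the MSB conversion above for the unsigned case: FFBH
 * counts from the top bit, TGSI wants an index from bit 0, so the result is
 * flipped with 31 - n while FFBH's -1 "no bit found" result is passed
 * through by the final CNDGE_INT. The signed FFBH_INT variant instead
 * searches for the first bit that differs from the sign bit. The helper name
 * is hypothetical. */
#if 0
#include <stdint.h>

static int32_t ref_umsb(uint32_t x)
{
	int32_t ffbh = -1;                       /* ALU_OP1_FFBH_UINT */
	for (int32_t i = 0; i < 32; i++) {
		if (x & (0x80000000u >> i)) {
			ffbh = i;
			break;
		}
	}
	return ffbh >= 0 ? 31 - ffbh : ffbh;     /* SUB_INT + CNDGE_INT */
}
#endif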
6875
6876 static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
6877 {
6878 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6879 struct r600_bytecode_alu alu;
6880 int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
6881 unsigned location;
6882 const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;
6883
6884 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
6885
6886 /* Interpolators have been marked for use already by allocate_system_value_inputs */
6887 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6888 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6889 location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
6890 }
6891 else {
6892 location = TGSI_INTERPOLATE_LOC_CENTROID;
6893 }
6894
6895 k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
6896 if (k < 0)
6897 k = 0;
6898 interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
6899 interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);
6900
6901 /* NOTE: currently offset is not perspective correct */
6902 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6903 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6904 int sample_gpr = -1;
6905 int gradientsH, gradientsV;
6906 struct r600_bytecode_tex tex;
6907
6908 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6909 sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
6910 }
6911
6912 gradientsH = r600_get_temp(ctx);
6913 gradientsV = r600_get_temp(ctx);
6914 for (i = 0; i < 2; i++) {
6915 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6916 tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
6917 tex.src_gpr = interp_gpr;
6918 tex.src_sel_x = interp_base_chan + 0;
6919 tex.src_sel_y = interp_base_chan + 1;
6920 tex.src_sel_z = 0;
6921 tex.src_sel_w = 0;
6922 tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
6923 tex.dst_sel_x = 0;
6924 tex.dst_sel_y = 1;
6925 tex.dst_sel_z = 7;
6926 tex.dst_sel_w = 7;
6927 			tex.inst_mod = 1; /* use per-pixel gradient calculation */
6928 tex.sampler_id = 0;
6929 tex.resource_id = tex.sampler_id;
6930 r = r600_bytecode_add_tex(ctx->bc, &tex);
6931 if (r)
6932 return r;
6933 }
6934
6935 for (i = 0; i < 2; i++) {
6936 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6937 alu.op = ALU_OP3_MULADD;
6938 alu.is_op3 = 1;
6939 alu.src[0].sel = gradientsH;
6940 alu.src[0].chan = i;
6941 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6942 alu.src[1].sel = sample_gpr;
6943 alu.src[1].chan = 2;
6944 }
6945 else {
6946 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
6947 }
6948 alu.src[2].sel = interp_gpr;
6949 alu.src[2].chan = interp_base_chan + i;
6950 alu.dst.sel = ctx->temp_reg;
6951 alu.dst.chan = i;
6952 alu.last = i == 1;
6953
6954 r = r600_bytecode_add_alu(ctx->bc, &alu);
6955 if (r)
6956 return r;
6957 }
6958
6959 for (i = 0; i < 2; i++) {
6960 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6961 alu.op = ALU_OP3_MULADD;
6962 alu.is_op3 = 1;
6963 alu.src[0].sel = gradientsV;
6964 alu.src[0].chan = i;
6965 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6966 alu.src[1].sel = sample_gpr;
6967 alu.src[1].chan = 3;
6968 }
6969 else {
6970 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
6971 }
6972 alu.src[2].sel = ctx->temp_reg;
6973 alu.src[2].chan = i;
6974 alu.dst.sel = ctx->temp_reg;
6975 alu.dst.chan = i;
6976 alu.last = i == 1;
6977
6978 r = r600_bytecode_add_alu(ctx->bc, &alu);
6979 if (r)
6980 return r;
6981 }
6982 }
6983
6984 tmp = r600_get_temp(ctx);
6985 for (i = 0; i < 8; i++) {
6986 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6987 alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;
6988
6989 alu.dst.sel = tmp;
6990 		if (i > 1 && i < 6) {
6991 alu.dst.write = 1;
6992 }
6993 else {
6994 alu.dst.write = 0;
6995 }
6996 alu.dst.chan = i % 4;
6997
6998 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6999 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
7000 alu.src[0].sel = ctx->temp_reg;
7001 alu.src[0].chan = 1 - (i % 2);
7002 } else {
7003 alu.src[0].sel = interp_gpr;
7004 alu.src[0].chan = interp_base_chan + 1 - (i % 2);
7005 }
7006 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
7007 alu.src[1].chan = 0;
7008
7009 alu.last = i % 4 == 3;
7010 alu.bank_swizzle_force = SQ_ALU_VEC_210;
7011
7012 r = r600_bytecode_add_alu(ctx->bc, &alu);
7013 if (r)
7014 return r;
7015 }
7016
7017 	/* INTERP can't swizzle dst */
7018 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7019 for (i = 0; i <= lasti; i++) {
7020 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7021 continue;
7022
7023 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7024 alu.op = ALU_OP1_MOV;
7025 alu.src[0].sel = tmp;
7026 alu.src[0].chan = ctx->src[0].swizzle[i];
7027 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7028 alu.dst.write = 1;
7029 alu.last = i == lasti;
7030 r = r600_bytecode_add_alu(ctx->bc, &alu);
7031 if (r)
7032 return r;
7033 }
7034
7035 return 0;
7036 }
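
/* Host-side sketch of the barycentric adjustment performed above for
 * INTERP_OFFSET/INTERP_SAMPLE: the (i, j) pair valid at the pixel center is
 * moved to a sub-pixel position using its screen-space gradients (the two
 * GET_GRADIENTS fetches feeding the MULADD loops), before the INTERP_XY/ZW
 * pair evaluates the attribute. The helper name is hypothetical. */
#if 0
static void ref_shift_ij(float ij[2],
                         const float ddx[2], const float ddy[2],
                         float off_x, float off_y)
{
	/* ij' = ij + ddx(ij)*off.x + ddy(ij)*off.y */
	ij[0] += ddx[0] * off_x + ddy[0] * off_y;
	ij[1] += ddx[1] * off_x + ddy[1] * off_y;
}
#endif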
7037
7038
7039 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
7040 {
7041 struct r600_bytecode_alu alu;
7042 int i, r;
7043
7044 for (i = 0; i < 4; i++) {
7045 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7046 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
7047 alu.op = ALU_OP0_NOP;
7048 alu.dst.chan = i;
7049 } else {
7050 alu.op = ALU_OP1_MOV;
7051 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7052 alu.src[0].sel = ctx->temp_reg;
7053 alu.src[0].chan = i;
7054 }
7055 if (i == 3) {
7056 alu.last = 1;
7057 }
7058 r = r600_bytecode_add_alu(ctx->bc, &alu);
7059 if (r)
7060 return r;
7061 }
7062 return 0;
7063 }
7064
7065 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
7066 unsigned temp, int chan,
7067 struct r600_bytecode_alu_src *bc_src,
7068 const struct r600_shader_src *shader_src)
7069 {
7070 struct r600_bytecode_alu alu;
7071 int r;
7072
7073 r600_bytecode_src(bc_src, shader_src, chan);
7074
7075 /* op3 operands don't support abs modifier */
7076 if (bc_src->abs) {
7077 assert(temp!=0); /* we actually need the extra register, make sure it is allocated. */
7078 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7079 alu.op = ALU_OP1_MOV;
7080 alu.dst.sel = temp;
7081 alu.dst.chan = chan;
7082 alu.dst.write = 1;
7083
7084 alu.src[0] = *bc_src;
7085 		alu.last = true; /* sufficient? */
7086 r = r600_bytecode_add_alu(ctx->bc, &alu);
7087 if (r)
7088 return r;
7089
7090 memset(bc_src, 0, sizeof(*bc_src));
7091 bc_src->sel = temp;
7092 bc_src->chan = chan;
7093 }
7094 return 0;
7095 }
7096
7097 static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)
7098 {
7099 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7100 struct r600_bytecode_alu alu;
7101 int i, j, r;
7102 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7103 int temp_regs[4];
7104 unsigned op = ctx->inst_info->op;
7105
7106 if (op == ALU_OP3_MULADD_IEEE &&
7107 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
7108 op = ALU_OP3_MULADD;
7109
7110 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7111 temp_regs[j] = 0;
7112 if (ctx->src[j].abs)
7113 temp_regs[j] = r600_get_temp(ctx);
7114 }
7115 for (i = 0; i < lasti + 1; i++) {
7116 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7117 continue;
7118
7119 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7120 alu.op = op;
7121 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7122 r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
7123 if (r)
7124 return r;
7125 }
7126
7127 if (dst == -1) {
7128 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7129 } else {
7130 alu.dst.sel = dst;
7131 }
7132 alu.dst.chan = i;
7133 alu.dst.write = 1;
7134 alu.is_op3 = 1;
7135 if (i == lasti) {
7136 alu.last = 1;
7137 }
7138 r = r600_bytecode_add_alu(ctx->bc, &alu);
7139 if (r)
7140 return r;
7141 }
7142 return 0;
7143 }
7144
7145 static int tgsi_op3(struct r600_shader_ctx *ctx)
7146 {
7147 return tgsi_op3_dst(ctx, -1);
7148 }
7149
7150 static int tgsi_dp(struct r600_shader_ctx *ctx)
7151 {
7152 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7153 struct r600_bytecode_alu alu;
7154 int i, j, r;
7155 unsigned op = ctx->inst_info->op;
7156 if (op == ALU_OP2_DOT4_IEEE &&
7157 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
7158 op = ALU_OP2_DOT4;
7159
7160 for (i = 0; i < 4; i++) {
7161 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7162 alu.op = op;
7163 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
7164 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
7165 }
7166
7167 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7168 alu.dst.chan = i;
7169 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
7170 /* handle some special cases */
7171 switch (inst->Instruction.Opcode) {
7172 case TGSI_OPCODE_DP2:
7173 if (i > 1) {
7174 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7175 alu.src[0].chan = alu.src[1].chan = 0;
7176 }
7177 break;
7178 case TGSI_OPCODE_DP3:
7179 if (i > 2) {
7180 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
7181 alu.src[0].chan = alu.src[1].chan = 0;
7182 }
7183 break;
7184 default:
7185 break;
7186 }
7187 if (i == 3) {
7188 alu.last = 1;
7189 }
7190 r = r600_bytecode_add_alu(ctx->bc, &alu);
7191 if (r)
7192 return r;
7193 }
7194 return 0;
7195 }
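
/* Host-side sketch of the DP2/DP3 trick above: the hardware only has a DOT4
 * reduction, so the unused source channels are replaced with constant zero
 * and the shorter dot product falls out. Shown for DP3; the helper name is
 * hypothetical. */
#if 0
static float ref_dp3(const float a[4], const float b[4])
{
	/* channel 3 forced to V_SQ_ALU_SRC_0 on both operands */
	return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + 0.0f * 0.0f;
}
#endif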
7196
7197 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
7198 unsigned index)
7199 {
7200 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7201 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
7202 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
7203 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
7204 ctx->src[index].neg || ctx->src[index].abs ||
7205 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
7206 }
7207
7208 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
7209 unsigned index)
7210 {
7211 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7212 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
7213 }
7214
7215 static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
7216 {
7217 struct r600_bytecode_vtx vtx;
7218 struct r600_bytecode_alu alu;
7219 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7220 int src_gpr, r, i;
7221 int id = tgsi_tex_get_src_gpr(ctx, 1);
7222 	int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; /* CF_INDEX_1 : CF_INDEX_NONE */
7223
7224 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7225 if (src_requires_loading) {
7226 for (i = 0; i < 4; i++) {
7227 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7228 alu.op = ALU_OP1_MOV;
7229 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7230 alu.dst.sel = ctx->temp_reg;
7231 alu.dst.chan = i;
7232 if (i == 3)
7233 alu.last = 1;
7234 alu.dst.write = 1;
7235 r = r600_bytecode_add_alu(ctx->bc, &alu);
7236 if (r)
7237 return r;
7238 }
7239 src_gpr = ctx->temp_reg;
7240 }
7241
7242 memset(&vtx, 0, sizeof(vtx));
7243 vtx.op = FETCH_OP_VFETCH;
7244 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
7245 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
7246 vtx.src_gpr = src_gpr;
7247 vtx.mega_fetch_count = 16;
7248 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7249 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
7250 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
7251 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
7252 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
7253 vtx.use_const_fields = 1;
7254 vtx.buffer_index_mode = sampler_index_mode;
7255
7256 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
7257 return r;
7258
7259 if (ctx->bc->chip_class >= EVERGREEN)
7260 return 0;
7261
7262 for (i = 0; i < 4; i++) {
7263 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7264 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7265 continue;
7266
7267 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7268 alu.op = ALU_OP2_AND_INT;
7269
7270 alu.dst.chan = i;
7271 alu.dst.sel = vtx.dst_gpr;
7272 alu.dst.write = 1;
7273
7274 alu.src[0].sel = vtx.dst_gpr;
7275 alu.src[0].chan = i;
7276
7277 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
7278 alu.src[1].sel += (id * 2);
7279 alu.src[1].chan = i % 4;
7280 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7281
7282 if (i == lasti)
7283 alu.last = 1;
7284 r = r600_bytecode_add_alu(ctx->bc, &alu);
7285 if (r)
7286 return r;
7287 }
7288
7289 if (inst->Dst[0].Register.WriteMask & 3) {
7290 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7291 alu.op = ALU_OP2_OR_INT;
7292
7293 alu.dst.chan = 3;
7294 alu.dst.sel = vtx.dst_gpr;
7295 alu.dst.write = 1;
7296
7297 alu.src[0].sel = vtx.dst_gpr;
7298 alu.src[0].chan = 3;
7299
7300 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
7301 alu.src[1].chan = 0;
7302 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7303
7304 alu.last = 1;
7305 r = r600_bytecode_add_alu(ctx->bc, &alu);
7306 if (r)
7307 return r;
7308 }
7309 return 0;
7310 }
7311
7312 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset, int eg_buffer_base)
7313 {
7314 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7315 int r;
7316 int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
7317 	int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 0; /* CF_INDEX_1 : CF_INDEX_NONE */
7318
7319 if (ctx->bc->chip_class < EVERGREEN) {
7320 struct r600_bytecode_alu alu;
7321 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7322 alu.op = ALU_OP1_MOV;
7323 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
7324 		/* on r600 we have them at channel 2 of the second dword */
7325 alu.src[0].sel += (id * 2) + 1;
7326 alu.src[0].chan = 1;
7327 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7328 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
7329 alu.last = 1;
7330 r = r600_bytecode_add_alu(ctx->bc, &alu);
7331 if (r)
7332 return r;
7333 return 0;
7334 } else {
7335 struct r600_bytecode_vtx vtx;
7336 memset(&vtx, 0, sizeof(vtx));
7337 vtx.op = FETCH_OP_GET_BUFFER_RESINFO;
7338 vtx.buffer_id = id + eg_buffer_base;
7339 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
7340 vtx.src_gpr = 0;
7341 vtx.mega_fetch_count = 16; /* no idea here really... */
7342 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7343 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
7344 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7; /* SEL_Y */
7345 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7; /* SEL_Z */
7346 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 4 : 7; /* SEL_W */
7347 vtx.data_format = FMT_32_32_32_32;
7348 vtx.buffer_index_mode = sampler_index_mode;
7349
7350 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
7351 return r;
7352 return 0;
7353 }
7354 }
7355
7356
7357 static int tgsi_tex(struct r600_shader_ctx *ctx)
7358 {
7359 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7360 struct r600_bytecode_tex tex;
7361 struct r600_bytecode_alu alu;
7362 unsigned src_gpr;
7363 int r, i, j;
7364 int opcode;
7365 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
7366 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7367 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
7368 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
7369
7370 bool txf_add_offsets = inst->Texture.NumOffsets &&
7371 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
7372 inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
7373
7374 	/* Texture fetch instructions can only use GPRs as sources.
7375 	 * Also they cannot negate the source or take its absolute value. */
7376 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
7377 tgsi_tex_src_requires_loading(ctx, 0)) ||
7378 read_compressed_msaa || txf_add_offsets;
7379
7380 boolean src_loaded = FALSE;
7381 unsigned sampler_src_reg = 1;
7382 int8_t offset_x = 0, offset_y = 0, offset_z = 0;
7383 boolean has_txq_cube_array_z = false;
7384 unsigned sampler_index_mode;
7385
7386 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
7387 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7388 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
7389 if (inst->Dst[0].Register.WriteMask & 4) {
7390 ctx->shader->has_txq_cube_array_z_comp = true;
7391 has_txq_cube_array_z = true;
7392 }
7393
7394 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
7395 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7396 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
7397 inst->Instruction.Opcode == TGSI_OPCODE_TG4)
7398 sampler_src_reg = 2;
7399
7400 /* TGSI moves the sampler to src reg 3 for TXD */
7401 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
7402 sampler_src_reg = 3;
7403
7404 	sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; /* CF_INDEX_1 : CF_INDEX_NONE */
7405
7406 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
7407
7408 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
7409 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
7410 if (ctx->bc->chip_class < EVERGREEN)
7411 ctx->shader->uses_tex_buffers = true;
7412 return r600_do_buffer_txq(ctx, 1, 0, R600_MAX_CONST_BUFFERS);
7413 }
7414 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
7415 if (ctx->bc->chip_class < EVERGREEN)
7416 ctx->shader->uses_tex_buffers = true;
7417 return do_vtx_fetch_inst(ctx, src_requires_loading);
7418 }
7419 }
7420
7421 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
7422 int out_chan;
7423 /* Add perspective divide */
7424 if (ctx->bc->chip_class == CAYMAN) {
7425 out_chan = 2;
7426 for (i = 0; i < 3; i++) {
7427 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7428 alu.op = ALU_OP1_RECIP_IEEE;
7429 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7430
7431 alu.dst.sel = ctx->temp_reg;
7432 alu.dst.chan = i;
7433 if (i == 2)
7434 alu.last = 1;
7435 if (out_chan == i)
7436 alu.dst.write = 1;
7437 r = r600_bytecode_add_alu(ctx->bc, &alu);
7438 if (r)
7439 return r;
7440 }
7441
7442 } else {
7443 out_chan = 3;
7444 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7445 alu.op = ALU_OP1_RECIP_IEEE;
7446 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7447
7448 alu.dst.sel = ctx->temp_reg;
7449 alu.dst.chan = out_chan;
7450 alu.last = 1;
7451 alu.dst.write = 1;
7452 r = r600_bytecode_add_alu(ctx->bc, &alu);
7453 if (r)
7454 return r;
7455 }
7456
7457 for (i = 0; i < 3; i++) {
7458 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7459 alu.op = ALU_OP2_MUL;
7460 alu.src[0].sel = ctx->temp_reg;
7461 alu.src[0].chan = out_chan;
7462 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
7463 alu.dst.sel = ctx->temp_reg;
7464 alu.dst.chan = i;
7465 alu.dst.write = 1;
7466 r = r600_bytecode_add_alu(ctx->bc, &alu);
7467 if (r)
7468 return r;
7469 }
7470 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7471 alu.op = ALU_OP1_MOV;
7472 alu.src[0].sel = V_SQ_ALU_SRC_1;
7473 alu.src[0].chan = 0;
7474 alu.dst.sel = ctx->temp_reg;
7475 alu.dst.chan = 3;
7476 alu.last = 1;
7477 alu.dst.write = 1;
7478 r = r600_bytecode_add_alu(ctx->bc, &alu);
7479 if (r)
7480 return r;
7481 src_loaded = TRUE;
7482 src_gpr = ctx->temp_reg;
7483 }
7484
7485
7486 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7487 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7488 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7489 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7490 inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
7491
7492 static const unsigned src0_swizzle[] = {2, 2, 0, 1};
7493 static const unsigned src1_swizzle[] = {1, 0, 2, 2};
7494
7495 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
7496 for (i = 0; i < 4; i++) {
7497 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7498 alu.op = ALU_OP2_CUBE;
7499 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7500 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
7501 alu.dst.sel = ctx->temp_reg;
7502 alu.dst.chan = i;
7503 if (i == 3)
7504 alu.last = 1;
7505 alu.dst.write = 1;
7506 r = r600_bytecode_add_alu(ctx->bc, &alu);
7507 if (r)
7508 return r;
7509 }
7510
7511 /* tmp1.z = RCP_e(|tmp1.z|) */
7512 if (ctx->bc->chip_class == CAYMAN) {
7513 for (i = 0; i < 3; i++) {
7514 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7515 alu.op = ALU_OP1_RECIP_IEEE;
7516 alu.src[0].sel = ctx->temp_reg;
7517 alu.src[0].chan = 2;
7518 alu.src[0].abs = 1;
7519 alu.dst.sel = ctx->temp_reg;
7520 alu.dst.chan = i;
7521 if (i == 2)
7522 alu.dst.write = 1;
7523 if (i == 2)
7524 alu.last = 1;
7525 r = r600_bytecode_add_alu(ctx->bc, &alu);
7526 if (r)
7527 return r;
7528 }
7529 } else {
7530 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7531 alu.op = ALU_OP1_RECIP_IEEE;
7532 alu.src[0].sel = ctx->temp_reg;
7533 alu.src[0].chan = 2;
7534 alu.src[0].abs = 1;
7535 alu.dst.sel = ctx->temp_reg;
7536 alu.dst.chan = 2;
7537 alu.dst.write = 1;
7538 alu.last = 1;
7539 r = r600_bytecode_add_alu(ctx->bc, &alu);
7540 if (r)
7541 return r;
7542 }
7543
7544 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
7545 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
7546 * muladd has no writemask, have to use another temp
7547 */
7548 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7549 alu.op = ALU_OP3_MULADD;
7550 alu.is_op3 = 1;
7551
7552 alu.src[0].sel = ctx->temp_reg;
7553 alu.src[0].chan = 0;
7554 alu.src[1].sel = ctx->temp_reg;
7555 alu.src[1].chan = 2;
7556
7557 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7558 alu.src[2].chan = 0;
7559 alu.src[2].value = u_bitcast_f2u(1.5f);
7560
7561 alu.dst.sel = ctx->temp_reg;
7562 alu.dst.chan = 0;
7563 alu.dst.write = 1;
7564
7565 r = r600_bytecode_add_alu(ctx->bc, &alu);
7566 if (r)
7567 return r;
7568
7569 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7570 alu.op = ALU_OP3_MULADD;
7571 alu.is_op3 = 1;
7572
7573 alu.src[0].sel = ctx->temp_reg;
7574 alu.src[0].chan = 1;
7575 alu.src[1].sel = ctx->temp_reg;
7576 alu.src[1].chan = 2;
7577
7578 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7579 alu.src[2].chan = 0;
7580 alu.src[2].value = u_bitcast_f2u(1.5f);
7581
7582 alu.dst.sel = ctx->temp_reg;
7583 alu.dst.chan = 1;
7584 alu.dst.write = 1;
7585
7586 alu.last = 1;
7587 r = r600_bytecode_add_alu(ctx->bc, &alu);
7588 if (r)
7589 return r;
7590 		/* write the initial compare value into the Z component:
7591 		   - src0.w for shadow cube
7592 		   - src1.x for shadow cube array */
7593 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7594 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7595 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7596 alu.op = ALU_OP1_MOV;
7597 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
7598 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7599 else
7600 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7601 alu.dst.sel = ctx->temp_reg;
7602 alu.dst.chan = 2;
7603 alu.dst.write = 1;
7604 alu.last = 1;
7605 r = r600_bytecode_add_alu(ctx->bc, &alu);
7606 if (r)
7607 return r;
7608 }
7609
7610 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7611 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7612 if (ctx->bc->chip_class >= EVERGREEN) {
7613 int mytmp = r600_get_temp(ctx);
7614 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7615 alu.op = ALU_OP1_MOV;
7616 alu.src[0].sel = ctx->temp_reg;
7617 alu.src[0].chan = 3;
7618 alu.dst.sel = mytmp;
7619 alu.dst.chan = 0;
7620 alu.dst.write = 1;
7621 alu.last = 1;
7622 r = r600_bytecode_add_alu(ctx->bc, &alu);
7623 if (r)
7624 return r;
7625
7626 /* multiply the original layer by 8 and add the face id (temp.w); the result stays in temp.w and is routed into Z by the sample swizzle */
7627 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7628 alu.op = ALU_OP3_MULADD;
7629 alu.is_op3 = 1;
7630 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7631 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7632 alu.src[1].chan = 0;
7633 alu.src[1].value = u_bitcast_f2u(8.0f);
7634 alu.src[2].sel = mytmp;
7635 alu.src[2].chan = 0;
7636 alu.dst.sel = ctx->temp_reg;
7637 alu.dst.chan = 3;
7638 alu.dst.write = 1;
7639 alu.last = 1;
7640 r = r600_bytecode_add_alu(ctx->bc, &alu);
7641 if (r)
7642 return r;
7643 } else { /* pre-evergreen */
7644 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7645 tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7646 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7647 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7648 tex.src_gpr = r600_get_temp(ctx);
7649 tex.src_sel_x = 0;
7650 tex.src_sel_y = 0;
7651 tex.src_sel_z = 0;
7652 tex.src_sel_w = 0;
7653 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7654 tex.coord_type_x = 1;
7655 tex.coord_type_y = 1;
7656 tex.coord_type_z = 1;
7657 tex.coord_type_w = 1;
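/* stage the array layer in the fetch's source GPR; SET_CUBEMAP_INDEX then
 * selects the cube map slice for the following sample on pre-evergreen parts */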
7658 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7659 alu.op = ALU_OP1_MOV;
7660 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7661 alu.dst.sel = tex.src_gpr;
7662 alu.dst.chan = 0;
7663 alu.last = 1;
7664 alu.dst.write = 1;
7665 r = r600_bytecode_add_alu(ctx->bc, &alu);
7666 if (r)
7667 return r;
7668
7669 r = r600_bytecode_add_tex(ctx->bc, &tex);
7670 if (r)
7671 return r;
7672 }
7673
7674 }
7675
7676 /* for cube forms of lod and bias we need to route things */
7677 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7678 inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7679 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7680 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7681 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7682 alu.op = ALU_OP1_MOV;
7683 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7684 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7685 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7686 else
7687 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7688 alu.dst.sel = ctx->temp_reg;
7689 alu.dst.chan = 2;
7690 alu.last = 1;
7691 alu.dst.write = 1;
7692 r = r600_bytecode_add_alu(ctx->bc, &alu);
7693 if (r)
7694 return r;
7695 }
7696
7697 src_loaded = TRUE;
7698 src_gpr = ctx->temp_reg;
7699 }
7700
7701 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7702 int temp_h = 0, temp_v = 0;
7703 int start_val = 0;
7704
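/* TXD path: copy the coordinate plus the two user-supplied derivative vectors
 * into temps, then bind them via SET_GRADIENTS_H/V before the actual sample */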
7705 /* if we've already loaded the src (i.e. CUBE), don't reload it. */
7706 if (src_loaded == TRUE)
7707 start_val = 1;
7708 else
7709 src_loaded = TRUE;
7710 for (i = start_val; i < 3; i++) {
7711 int treg = r600_get_temp(ctx);
7712
7713 if (i == 0)
7714 src_gpr = treg;
7715 else if (i == 1)
7716 temp_h = treg;
7717 else
7718 temp_v = treg;
7719
7720 for (j = 0; j < 4; j++) {
7721 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7722 alu.op = ALU_OP1_MOV;
7723 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7724 alu.dst.sel = treg;
7725 alu.dst.chan = j;
7726 if (j == 3)
7727 alu.last = 1;
7728 alu.dst.write = 1;
7729 r = r600_bytecode_add_alu(ctx->bc, &alu);
7730 if (r)
7731 return r;
7732 }
7733 }
7734 for (i = 1; i < 3; i++) {
7735 /* set gradients h/v */
7736 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7737 tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7738 FETCH_OP_SET_GRADIENTS_V;
7739 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7740 tex.sampler_index_mode = sampler_index_mode;
7741 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7742 tex.resource_index_mode = sampler_index_mode;
7743
7744 tex.src_gpr = (i == 1) ? temp_h : temp_v;
7745 tex.src_sel_x = 0;
7746 tex.src_sel_y = 1;
7747 tex.src_sel_z = 2;
7748 tex.src_sel_w = 3;
7749
7750 tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7751 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7752 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7753 tex.coord_type_x = 1;
7754 tex.coord_type_y = 1;
7755 tex.coord_type_z = 1;
7756 tex.coord_type_w = 1;
7757 }
7758 r = r600_bytecode_add_tex(ctx->bc, &tex);
7759 if (r)
7760 return r;
7761 }
7762 }
7763
7764 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7765 /* Gather4 should follow the same rules as bilinear filtering, but the hardware
7766 * incorrectly forces nearest filtering if the texture format is integer.
7767 * The only effect it has on Gather4, which always returns 4 texels for
7768 * bilinear filtering, is that the final coordinates are off by 0.5 of
7769 * the texel size.
7770 *
7771 * The workaround is to subtract 0.5 from the unnormalized coordinates,
7772 * or (0.5 / size) from the normalized coordinates.
7773 */
7774 if (inst->Texture.ReturnType == TGSI_RETURN_TYPE_SINT ||
7775 inst->Texture.ReturnType == TGSI_RETURN_TYPE_UINT) {
7776 int treg = r600_get_temp(ctx);
7777
7778 /* move the array and comparison coordinate to temp_reg if needed */
7779 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7780 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7781 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY) && !src_loaded) {
7782 int end = inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ? 3 : 2;
7783 for (i = 2; i <= end; i++) {
7784 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7785 alu.op = ALU_OP1_MOV;
7786 alu.dst.sel = ctx->temp_reg;
7787 alu.dst.chan = i;
7788 alu.dst.write = 1;
7789 alu.last = (i == end);
7790 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7791 r = r600_bytecode_add_alu(ctx->bc, &alu);
7792 if (r)
7793 return r;
7794 }
7795 }
7796
7797 if (inst->Texture.Texture == TGSI_TEXTURE_RECT ||
7798 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT) {
7799 for (i = 0; i < 2; i++) {
7800 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7801 alu.op = ALU_OP2_ADD;
7802 alu.dst.sel = ctx->temp_reg;
7803 alu.dst.chan = i;
7804 alu.dst.write = 1;
7805 alu.last = i == 1;
7806 if (src_loaded) {
7807 alu.src[0].sel = ctx->temp_reg;
7808 alu.src[0].chan = i;
7809 } else
7810 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7811 alu.src[1].sel = V_SQ_ALU_SRC_0_5;
7812 alu.src[1].neg = 1;
7813 r = r600_bytecode_add_alu(ctx->bc, &alu);
7814 if (r)
7815 return r;
7816 }
7817 } else {
7818 /* execute a TXQ */
7819 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7820 tex.op = FETCH_OP_GET_TEXTURE_RESINFO;
7821 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7822 tex.sampler_index_mode = sampler_index_mode;
7823 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7824 tex.resource_index_mode = sampler_index_mode;
7825 tex.dst_gpr = treg;
7826 tex.src_sel_x = 4;
7827 tex.src_sel_y = 4;
7828 tex.src_sel_z = 4;
7829 tex.src_sel_w = 4;
7830 tex.dst_sel_x = 0;
7831 tex.dst_sel_y = 1;
7832 tex.dst_sel_z = 7;
7833 tex.dst_sel_w = 7;
7834 r = r600_bytecode_add_tex(ctx->bc, &tex);
7835 if (r)
7836 return r;
7837
7838 /* coord.xy = -0.5 * (1.0/int_to_flt(size)) + coord.xy */
7839 if (ctx->bc->chip_class == CAYMAN) {
7841 for (i = 0; i < 2; i++) {
7842 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7843 alu.op = ALU_OP1_INT_TO_FLT;
7844 alu.dst.sel = treg;
7845 alu.dst.chan = i;
7846 alu.dst.write = 1;
7847 alu.src[0].sel = treg;
7848 alu.src[0].chan = i;
7849 alu.last = (i == 1) ? 1 : 0;
7850 r = r600_bytecode_add_alu(ctx->bc, &alu);
7851 if (r)
7852 return r;
7853 }
7854 for (j = 0; j < 2; j++) {
7855 for (i = 0; i < 3; i++) {
7856 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7857 alu.op = ALU_OP1_RECIP_IEEE;
7858 alu.src[0].sel = treg;
7859 alu.src[0].chan = j;
7860 alu.dst.sel = treg;
7861 alu.dst.chan = i;
7862 if (i == 2)
7863 alu.last = 1;
7864 if (i == j)
7865 alu.dst.write = 1;
7866 r = r600_bytecode_add_alu(ctx->bc, &alu);
7867 if (r)
7868 return r;
7869 }
7870 }
7871 } else {
7872 for (i = 0; i < 2; i++) {
7873 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7874 alu.op = ALU_OP1_INT_TO_FLT;
7875 alu.dst.sel = treg;
7876 alu.dst.chan = i;
7877 alu.dst.write = 1;
7878 alu.src[0].sel = treg;
7879 alu.src[0].chan = i;
7880 alu.last = 1;
7881 r = r600_bytecode_add_alu(ctx->bc, &alu);
7882 if (r)
7883 return r;
7884 }
7885 for (i = 0; i < 2; i++) {
7886 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7887 alu.op = ALU_OP1_RECIP_IEEE;
7888 alu.src[0].sel = treg;
7889 alu.src[0].chan = i;
7890 alu.dst.sel = treg;
7891 alu.dst.chan = i;
7892 alu.last = 1;
7893 alu.dst.write = 1;
7894 r = r600_bytecode_add_alu(ctx->bc, &alu);
7895 if (r)
7896 return r;
7897 }
7898 }
7899 for (i = 0; i < 2; i++) {
7900 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7901 alu.op = ALU_OP3_MULADD;
7902 alu.is_op3 = 1;
7903 alu.dst.sel = ctx->temp_reg;
7904 alu.dst.chan = i;
7905 alu.dst.write = 1;
7906 alu.last = i == 1;
7907 alu.src[0].sel = treg;
7908 alu.src[0].chan = i;
7909 alu.src[1].sel = V_SQ_ALU_SRC_0_5;
7910 alu.src[1].neg = 1;
7911 if (src_loaded) {
7912 alu.src[2].sel = ctx->temp_reg;
7913 alu.src[2].chan = i;
7914 } else
7915 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
7916 r = r600_bytecode_add_alu(ctx->bc, &alu);
7917 if (r)
7918 return r;
7919 }
7920 }
7921 src_loaded = TRUE;
7922 src_gpr = ctx->temp_reg;
7923 }
7924 }
7925
7926 if (src_requires_loading && !src_loaded) {
7927 for (i = 0; i < 4; i++) {
7928 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7929 alu.op = ALU_OP1_MOV;
7930 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7931 alu.dst.sel = ctx->temp_reg;
7932 alu.dst.chan = i;
7933 if (i == 3)
7934 alu.last = 1;
7935 alu.dst.write = 1;
7936 r = r600_bytecode_add_alu(ctx->bc, &alu);
7937 if (r)
7938 return r;
7939 }
7940 src_loaded = TRUE;
7941 src_gpr = ctx->temp_reg;
7942 }
7943
7944 /* get offset values */
7945 if (inst->Texture.NumOffsets) {
7946 assert(inst->Texture.NumOffsets == 1);
7947
7948 /* The texture offset feature doesn't work with the TXF instruction
7949 * and must be emulated by adding the offset to the texture coordinates. */
7950 if (txf_add_offsets) {
7951 const struct tgsi_texture_offset *off = inst->TexOffsets;
7952
7953 switch (inst->Texture.Texture) {
7954 case TGSI_TEXTURE_3D:
7955 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7956 alu.op = ALU_OP2_ADD_INT;
7957 alu.src[0].sel = src_gpr;
7958 alu.src[0].chan = 2;
7959 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7960 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
7961 alu.dst.sel = src_gpr;
7962 alu.dst.chan = 2;
7963 alu.dst.write = 1;
7964 alu.last = 1;
7965 r = r600_bytecode_add_alu(ctx->bc, &alu);
7966 if (r)
7967 return r;
7968 /* fall through */
7969
7970 case TGSI_TEXTURE_2D:
7971 case TGSI_TEXTURE_SHADOW2D:
7972 case TGSI_TEXTURE_RECT:
7973 case TGSI_TEXTURE_SHADOWRECT:
7974 case TGSI_TEXTURE_2D_ARRAY:
7975 case TGSI_TEXTURE_SHADOW2D_ARRAY:
7976 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7977 alu.op = ALU_OP2_ADD_INT;
7978 alu.src[0].sel = src_gpr;
7979 alu.src[0].chan = 1;
7980 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7981 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
7982 alu.dst.sel = src_gpr;
7983 alu.dst.chan = 1;
7984 alu.dst.write = 1;
7985 alu.last = 1;
7986 r = r600_bytecode_add_alu(ctx->bc, &alu);
7987 if (r)
7988 return r;
7989 /* fall through */
7990
7991 case TGSI_TEXTURE_1D:
7992 case TGSI_TEXTURE_SHADOW1D:
7993 case TGSI_TEXTURE_1D_ARRAY:
7994 case TGSI_TEXTURE_SHADOW1D_ARRAY:
7995 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7996 alu.op = ALU_OP2_ADD_INT;
7997 alu.src[0].sel = src_gpr;
7998 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7999 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
8000 alu.dst.sel = src_gpr;
8001 alu.dst.write = 1;
8002 alu.last = 1;
8003 r = r600_bytecode_add_alu(ctx->bc, &alu);
8004 if (r)
8005 return r;
8006 break;
8007 /* texture offsets do not apply to other texture targets */
8008 }
8009 } else {
8010 switch (inst->Texture.Texture) {
8011 case TGSI_TEXTURE_3D:
8012 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
8013 /* fallthrough */
8014 case TGSI_TEXTURE_2D:
8015 case TGSI_TEXTURE_SHADOW2D:
8016 case TGSI_TEXTURE_RECT:
8017 case TGSI_TEXTURE_SHADOWRECT:
8018 case TGSI_TEXTURE_2D_ARRAY:
8019 case TGSI_TEXTURE_SHADOW2D_ARRAY:
8020 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
8021 /* fallthrough */
8022 case TGSI_TEXTURE_1D:
8023 case TGSI_TEXTURE_SHADOW1D:
8024 case TGSI_TEXTURE_1D_ARRAY:
8025 case TGSI_TEXTURE_SHADOW1D_ARRAY:
8026 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
8027 }
8028 }
8029 }
8030
8031 /* Obtain the sample index for reading a compressed MSAA color texture.
8032 * To read the FMASK, we use the ldfptr instruction, which tells us
8033 * where the samples are stored.
8034 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
8035 * which is the identity mapping. Each nibble says which physical sample
8036 * should be fetched to get that sample.
8037 *
8038 * Assume src.w contains the sample index. It should be modified like this:
8039 * src.w = (ldfptr() >> (src.w * 4)) & 0xF;
8040 * Then fetch the texel with src.
8041 */
8042 if (read_compressed_msaa) {
8043 unsigned sample_chan = 3;
8044 unsigned temp = r600_get_temp(ctx);
8045 assert(src_loaded);
8046
8047 /* temp.w = ldfptr() */
8048 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8049 tex.op = FETCH_OP_LD;
8050 tex.inst_mod = 1; /* to indicate this is ldfptr */
8051 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8052 tex.sampler_index_mode = sampler_index_mode;
8053 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8054 tex.resource_index_mode = sampler_index_mode;
8055 tex.src_gpr = src_gpr;
8056 tex.dst_gpr = temp;
8057 tex.dst_sel_x = 7; /* mask out these components */
8058 tex.dst_sel_y = 7;
8059 tex.dst_sel_z = 7;
8060 tex.dst_sel_w = 0; /* store X */
8061 tex.src_sel_x = 0;
8062 tex.src_sel_y = 1;
8063 tex.src_sel_z = 2;
8064 tex.src_sel_w = 3;
8065 tex.offset_x = offset_x;
8066 tex.offset_y = offset_y;
8067 tex.offset_z = offset_z;
8068 r = r600_bytecode_add_tex(ctx->bc, &tex);
8069 if (r)
8070 return r;
8071
8072 /* temp.x = sample_index*4 */
8073 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8074 alu.op = ALU_OP2_MULLO_INT;
8075 alu.src[0].sel = src_gpr;
8076 alu.src[0].chan = sample_chan;
8077 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8078 alu.src[1].value = 4;
8079 alu.dst.sel = temp;
8080 alu.dst.chan = 0;
8081 alu.dst.write = 1;
8082 r = emit_mul_int_op(ctx->bc, &alu);
8083 if (r)
8084 return r;
8085
8086 /* sample_index = temp.w >> temp.x */
8087 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8088 alu.op = ALU_OP2_LSHR_INT;
8089 alu.src[0].sel = temp;
8090 alu.src[0].chan = 3;
8091 alu.src[1].sel = temp;
8092 alu.src[1].chan = 0;
8093 alu.dst.sel = src_gpr;
8094 alu.dst.chan = sample_chan;
8095 alu.dst.write = 1;
8096 alu.last = 1;
8097 r = r600_bytecode_add_alu(ctx->bc, &alu);
8098 if (r)
8099 return r;
8100
8101 /* sample_index & 0xF */
8102 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8103 alu.op = ALU_OP2_AND_INT;
8104 alu.src[0].sel = src_gpr;
8105 alu.src[0].chan = sample_chan;
8106 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8107 alu.src[1].value = 0xF;
8108 alu.dst.sel = src_gpr;
8109 alu.dst.chan = sample_chan;
8110 alu.dst.write = 1;
8111 alu.last = 1;
8112 r = r600_bytecode_add_alu(ctx->bc, &alu);
8113 if (r)
8114 return r;
8115 #if 0
8116 /* visualize the FMASK */
8117 for (i = 0; i < 4; i++) {
8118 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8119 alu.op = ALU_OP1_INT_TO_FLT;
8120 alu.src[0].sel = src_gpr;
8121 alu.src[0].chan = sample_chan;
8122 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8123 alu.dst.chan = i;
8124 alu.dst.write = 1;
8125 alu.last = 1;
8126 r = r600_bytecode_add_alu(ctx->bc, &alu);
8127 if (r)
8128 return r;
8129 }
8130 return 0;
8131 #endif
8132 }
8133
8134 /* does this shader want the number of layers from TXQ for a cube array? */
8135 if (has_txq_cube_array_z) {
8136 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8137
8138 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8139 alu.op = ALU_OP1_MOV;
8140
8141 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
8142 if (ctx->bc->chip_class >= EVERGREEN) {
8143 /* on evergreen each dword holds the number of cubes */
8144 alu.src[0].sel += id / 4;
8145 alu.src[0].chan = id % 4;
8146 } else {
8147 /* on r600 it is at channel 2 of the second dword */
8148 alu.src[0].sel += (id * 2) + 1;
8149 alu.src[0].chan = 2;
8150 }
8151 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
8152 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
8153 alu.last = 1;
8154 r = r600_bytecode_add_alu(ctx->bc, &alu);
8155 if (r)
8156 return r;
8157 /* disable writemask from texture instruction */
8158 inst->Dst[0].Register.WriteMask &= ~4;
8159 }
8160
8161 opcode = ctx->inst_info->op;
8162 if (opcode == FETCH_OP_GATHER4 &&
8163 inst->TexOffsets[0].File != TGSI_FILE_NULL &&
8164 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
8165 opcode = FETCH_OP_GATHER4_O;
8166
8167 /* GATHER4_O/GATHER4_C_O use offset values loaded by
8168 SET_TEXTURE_OFFSETS instruction. The immediate offset values
8169 encoded in the instruction are ignored. */
8170 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8171 tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
8172 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8173 tex.sampler_index_mode = sampler_index_mode;
8174 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8175 tex.resource_index_mode = sampler_index_mode;
8176
8177 tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
8178 tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
8179 tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
8180 tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
8181 tex.src_sel_w = 4;
8182
8183 tex.dst_sel_x = 7;
8184 tex.dst_sel_y = 7;
8185 tex.dst_sel_z = 7;
8186 tex.dst_sel_w = 7;
8187
8188 r = r600_bytecode_add_tex(ctx->bc, &tex);
8189 if (r)
8190 return r;
8191 }
8192
8193 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8194 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8195 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8196 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8197 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
8198 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
8199 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8200 switch (opcode) {
8201 case FETCH_OP_SAMPLE:
8202 opcode = FETCH_OP_SAMPLE_C;
8203 break;
8204 case FETCH_OP_SAMPLE_L:
8205 opcode = FETCH_OP_SAMPLE_C_L;
8206 break;
8207 case FETCH_OP_SAMPLE_LB:
8208 opcode = FETCH_OP_SAMPLE_C_LB;
8209 break;
8210 case FETCH_OP_SAMPLE_G:
8211 opcode = FETCH_OP_SAMPLE_C_G;
8212 break;
8213 /* Texture gather variants */
8214 case FETCH_OP_GATHER4:
8215 opcode = FETCH_OP_GATHER4_C;
8216 break;
8217 case FETCH_OP_GATHER4_O:
8218 opcode = FETCH_OP_GATHER4_C_O;
8219 break;
8220 }
8221 }
8222
8223 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8224 tex.op = opcode;
8225
8226 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
8227 tex.sampler_index_mode = sampler_index_mode;
8228 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
8229 tex.resource_index_mode = sampler_index_mode;
8230 tex.src_gpr = src_gpr;
8231 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8232
8233 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
8234 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
8235 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
8236 }
8237
8238 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
8239 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
8240 tex.inst_mod = texture_component_select;
8241
8242 if (ctx->bc->chip_class == CAYMAN) {
8243 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8244 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8245 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8246 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8247 } else {
8248 /* GATHER4 result order is different from TGSI TG4 */
8249 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 1 : 7;
8250 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 2 : 7;
8251 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 0 : 7;
8252 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8253 }
8254 }
8255 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
8256 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8257 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8258 tex.dst_sel_z = 7;
8259 tex.dst_sel_w = 7;
8260 }
8261 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8262 tex.dst_sel_x = 3;
8263 tex.dst_sel_y = 7;
8264 tex.dst_sel_z = 7;
8265 tex.dst_sel_w = 7;
8266 }
8267 else {
8268 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8269 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8270 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8271 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8272 }
8273
8274
8275 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
8276 tex.src_sel_x = 4;
8277 tex.src_sel_y = 4;
8278 tex.src_sel_z = 4;
8279 tex.src_sel_w = 4;
8280 } else if (src_loaded) {
8281 tex.src_sel_x = 0;
8282 tex.src_sel_y = 1;
8283 tex.src_sel_z = 2;
8284 tex.src_sel_w = 3;
8285 } else {
8286 tex.src_sel_x = ctx->src[0].swizzle[0];
8287 tex.src_sel_y = ctx->src[0].swizzle[1];
8288 tex.src_sel_z = ctx->src[0].swizzle[2];
8289 tex.src_sel_w = ctx->src[0].swizzle[3];
8290 tex.src_rel = ctx->src[0].rel;
8291 }
8292
8293 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
8294 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
8295 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8296 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
8297 tex.src_sel_x = 1;
8298 tex.src_sel_y = 0;
8299 tex.src_sel_z = 3;
8300 tex.src_sel_w = 2; /* route Z compare or Lod value into W */
8301 }
8302
8303 if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
8304 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
8305 tex.coord_type_x = 1;
8306 tex.coord_type_y = 1;
8307 }
8308 tex.coord_type_z = 1;
8309 tex.coord_type_w = 1;
8310
8311 tex.offset_x = offset_x;
8312 tex.offset_y = offset_y;
8313 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
8314 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8315 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
8316 tex.offset_z = 0;
8317 }
8318 else {
8319 tex.offset_z = offset_z;
8320 }
8321
8322 /* Put the depth for comparison in W.
8323 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
8324 * Some instructions expect the depth in Z. */
8325 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
8326 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
8327 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
8328 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
8329 opcode != FETCH_OP_SAMPLE_C_L &&
8330 opcode != FETCH_OP_SAMPLE_C_LB) {
8331 tex.src_sel_w = tex.src_sel_z;
8332 }
8333
8334 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
8335 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
8336 if (opcode == FETCH_OP_SAMPLE_C_L ||
8337 opcode == FETCH_OP_SAMPLE_C_LB) {
8338 /* the array index is read from Y */
8339 tex.coord_type_y = 0;
8340 } else {
8341 /* the array index is read from Z */
8342 tex.coord_type_z = 0;
8343 tex.src_sel_z = tex.src_sel_y;
8344 }
8345 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
8346 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
8347 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
8348 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
8349 (ctx->bc->chip_class >= EVERGREEN)))
8350 /* the array index is read from Z */
8351 tex.coord_type_z = 0;
8352
8353 /* mask unused source components */
8354 if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
8355 switch (inst->Texture.Texture) {
8356 case TGSI_TEXTURE_2D:
8357 case TGSI_TEXTURE_RECT:
8358 tex.src_sel_z = 7;
8359 tex.src_sel_w = 7;
8360 break;
8361 case TGSI_TEXTURE_1D_ARRAY:
8362 tex.src_sel_y = 7;
8363 tex.src_sel_w = 7;
8364 break;
8365 case TGSI_TEXTURE_1D:
8366 tex.src_sel_y = 7;
8367 tex.src_sel_z = 7;
8368 tex.src_sel_w = 7;
8369 break;
8370 }
8371 }
8372
8373 r = r600_bytecode_add_tex(ctx->bc, &tex);
8374 if (r)
8375 return r;
8376
8377 /* add shadow ambient support - gallium doesn't do it yet */
8378 return 0;
8379 }
8380
8381 static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
8382 struct tgsi_full_src_register *src)
8383 {
8384 unsigned i;
8385
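/* map a TGSI atomic counter reference onto its flattened hardware counter
 * index: indirect references match by ArrayID, direct ones by buffer id and
 * index range */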
8386 if (src->Register.Indirect) {
8387 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8388 if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
8389 return ctx->shader->atomics[i].hw_idx;
8390 }
8391 } else {
8392 uint32_t index = src->Register.Index;
8393 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
8394 if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
8395 continue;
8396 if (index > ctx->shader->atomics[i].end)
8397 continue;
8398 if (index < ctx->shader->atomics[i].start)
8399 continue;
8400 uint32_t offset = (index - ctx->shader->atomics[i].start);
8401 return ctx->shader->atomics[i].hw_idx + offset;
8402 }
8403 }
8404 assert(0);
8405 return -1;
8406 }
8407
8408 static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
8409 int *uav_id_p, int *uav_index_mode_p)
8410 {
8411 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8412 int uav_id, uav_index_mode = 0;
8413 int r;
8414 bool is_cm = (ctx->bc->chip_class == CAYMAN);
8415
8416 uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
8417
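/* cayman wants the GDS offset (uav_id * 4 bytes, plus any indirect index
 * shifted to bytes) staged in temp.x; evergreen instead encodes uav_id in the
 * instruction and flags indirect access with index mode 2 */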
8418 if (inst->Src[0].Register.Indirect) {
8419 if (is_cm) {
8420 struct r600_bytecode_alu alu;
8421 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8422 alu.op = ALU_OP2_LSHL_INT;
8423 alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
8424 alu.src[0].chan = 0;
8425 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8426 alu.src[1].value = 2;
8427 alu.dst.sel = ctx->temp_reg;
8428 alu.dst.chan = 0;
8429 alu.dst.write = 1;
8430 alu.last = 1;
8431 r = r600_bytecode_add_alu(ctx->bc, &alu);
8432 if (r)
8433 return r;
8434
8435 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
8436 ctx->temp_reg, 0,
8437 ctx->temp_reg, 0,
8438 V_SQ_ALU_SRC_LITERAL, uav_id * 4);
8439 if (r)
8440 return r;
8441 } else
8442 uav_index_mode = 2;
8443 } else if (is_cm) {
8444 r = single_alu_op2(ctx, ALU_OP1_MOV,
8445 ctx->temp_reg, 0,
8446 V_SQ_ALU_SRC_LITERAL, uav_id * 4,
8447 0, 0);
8448 if (r)
8449 return r;
8450 }
8451 *uav_id_p = uav_id;
8452 *uav_index_mode_p = uav_index_mode;
8453 return 0;
8454 }
8455
8456 static int tgsi_load_gds(struct r600_shader_ctx *ctx)
8457 {
8458 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8459 int r;
8460 struct r600_bytecode_gds gds;
8461 int uav_id = 0;
8462 int uav_index_mode = 0;
8463 bool is_cm = (ctx->bc->chip_class == CAYMAN);
8464
8465 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
8466 if (r)
8467 return r;
8468
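/* GDS_READ_RET returns a single dword into dst.x; on evergreen the offset
 * comes from the uav_id field (with alloc_consume set), on cayman from the
 * temp computed above */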
8469 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
8470 gds.op = FETCH_OP_GDS_READ_RET;
8471 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8472 gds.uav_id = is_cm ? 0 : uav_id;
8473 gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
8474 gds.src_gpr = ctx->temp_reg;
8475 gds.src_sel_x = (is_cm) ? 0 : 4;
8476 gds.src_sel_y = 4;
8477 gds.src_sel_z = 4;
8478 gds.dst_sel_x = 0;
8479 gds.dst_sel_y = 7;
8480 gds.dst_sel_z = 7;
8481 gds.dst_sel_w = 7;
8482 gds.src_gpr2 = 0;
8483 gds.alloc_consume = !is_cm;
8484 r = r600_bytecode_add_gds(ctx->bc, &gds);
8485 if (r)
8486 return r;
8487
8488 ctx->bc->cf_last->vpm = 1;
8489 return 0;
8490 }
8491
8492 /* load the index source into a temp, zero-filling unused components; this fixes up 1D arrays properly */
8493 static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
8494 {
8495 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8496 int r, i;
8497 struct r600_bytecode_alu alu;
8498 int temp_reg = r600_get_temp(ctx);
8499
8500 for (i = 0; i < 4; i++) {
8501 bool def_val = true, write_zero = false;
8502 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8503 alu.op = ALU_OP1_MOV;
8504 alu.dst.sel = temp_reg;
8505 alu.dst.chan = i;
8506
8507 switch (inst->Memory.Texture) {
8508 case TGSI_TEXTURE_BUFFER:
8509 case TGSI_TEXTURE_1D:
8510 if (i == 1 || i == 2 || i == 3) {
8511 write_zero = true;
8512 }
8513 break;
8514 case TGSI_TEXTURE_1D_ARRAY:
8515 if (i == 1 || i == 3)
8516 write_zero = true;
8517 else if (i == 2) {
8518 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
8519 def_val = false;
8520 }
8521 break;
8522 case TGSI_TEXTURE_2D:
8523 if (i == 2 || i == 3)
8524 write_zero = true;
8525 break;
8526 default:
8527 if (i == 3)
8528 write_zero = true;
8529 break;
8530 }
8531
8532 if (write_zero) {
8533 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
8534 alu.src[0].value = 0;
8535 } else if (def_val) {
8536 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
8537 }
8538
8539 if (i == 3)
8540 alu.last = 1;
8541 alu.dst.write = 1;
8542 r = r600_bytecode_add_alu(ctx->bc, &alu);
8543 if (r)
8544 return r;
8545 }
8546 *idx_gpr = temp_reg;
8547 return 0;
8548 }
8549
8550 static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
8551 int temp_reg)
8552 {
8553 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8554 int r;
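/* buffer elements are addressed in 32-bit words, so divide the byte offset
 * by 4: folded at compile time for immediates, via LSHR otherwise */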
8555 if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
8556 int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
8557 r = single_alu_op2(ctx, ALU_OP1_MOV,
8558 temp_reg, 0,
8559 V_SQ_ALU_SRC_LITERAL, value >> 2,
8560 0, 0);
8561 if (r)
8562 return r;
8563 } else {
8564 struct r600_bytecode_alu alu;
8565 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8566 alu.op = ALU_OP2_LSHR_INT;
8567 r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
8568 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
8569 alu.src[1].value = 2;
8570 alu.dst.sel = temp_reg;
8571 alu.dst.write = 1;
8572 alu.last = 1;
8573 r = r600_bytecode_add_alu(ctx->bc, &alu);
8574 if (r)
8575 return r;
8576 }
8577 return 0;
8578 }
8579
8580 static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
8581 {
8582 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8583 /* plain buffer load: compute the dword offset and VFETCH straight from the buffer resource */
8584 struct r600_bytecode_vtx vtx;
8585 struct r600_bytecode_cf *cf;
8586 int r;
8587 int temp_reg = r600_get_temp(ctx);
8588 unsigned rat_index_mode;
8589 unsigned base;
8590
8591 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8592 base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];
8593
8594 r = load_buffer_coord(ctx, 1, temp_reg);
8595 if (r)
8596 return r;
8597 ctx->bc->cf_last->barrier = 1;
8598 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8599 vtx.op = FETCH_OP_VFETCH;
8600 vtx.buffer_id = inst->Src[0].Register.Index + base;
8601 vtx.buffer_index_mode = rat_index_mode;
8602 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8603 vtx.src_gpr = temp_reg;
8604 vtx.src_sel_x = 0;
8605 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8606 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
8607 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
8608 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
8609 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
8610 vtx.num_format_all = 1;
8611 vtx.format_comp_all = 1;
8612 vtx.srf_mode_all = 0;
8613
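/* pick the narrowest 32-bit fetch format that still covers the highest written component */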
8614 if (inst->Dst[0].Register.WriteMask & 8) {
8615 vtx.data_format = FMT_32_32_32_32;
8616 vtx.use_const_fields = 0;
8617 } else if (inst->Dst[0].Register.WriteMask & 4) {
8618 vtx.data_format = FMT_32_32_32;
8619 vtx.use_const_fields = 0;
8620 } else if (inst->Dst[0].Register.WriteMask & 2) {
8621 vtx.data_format = FMT_32_32;
8622 vtx.use_const_fields = 0;
8623 } else {
8624 vtx.data_format = FMT_32;
8625 vtx.use_const_fields = 0;
8626 }
8627
8628 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8629 if (r)
8630 return r;
8631 cf = ctx->bc->cf_last;
8632 cf->barrier = 1;
8633 return 0;
8634 }
8635
8636 static int tgsi_load_rat(struct r600_shader_ctx *ctx)
8637 {
8638 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8639 /* have to work out the offset into the RAT immediate return buffer */
8640 struct r600_bytecode_vtx vtx;
8641 struct r600_bytecode_cf *cf;
8642 int r;
8643 int idx_gpr;
8644 unsigned format, num_format, format_comp, endian;
8645 const struct util_format_description *desc;
8646 unsigned rat_index_mode;
8647 unsigned immed_base;
8648
8649 r = load_thread_id_gpr(ctx);
8650 if (r)
8651 return r;
8652
8653 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8654
8655 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
8656 r = load_index_src(ctx, 1, &idx_gpr);
8657 if (r)
8658 return r;
8659
8660 if (rat_index_mode)
8661 egcm_load_index_reg(ctx->bc, 1, false);
8662
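/* RAT reads are a three-step sequence: MEM_RAT NOP_RTN issues the read,
 * WAIT_ACK waits for the returned data, then a VFETCH from the immediate
 * return buffer moves it into the destination GPR */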
8663 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8664 cf = ctx->bc->cf_last;
8665
8666 cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
8667 cf->rat.inst = V_RAT_INST_NOP_RTN;
8668 cf->rat.index_mode = rat_index_mode;
8669 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
8670 cf->output.gpr = ctx->thread_id_gpr;
8671 cf->output.index_gpr = idx_gpr;
8672 cf->output.comp_mask = 0xf;
8673 cf->output.burst_count = 1;
8674 cf->vpm = 1;
8675 cf->barrier = 1;
8676 cf->mark = 1;
8677 cf->output.elem_size = 0;
8678
8679 r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
8680 cf = ctx->bc->cf_last;
8681 cf->barrier = 1;
8682
8683 desc = util_format_description(inst->Memory.Format);
8684 r600_vertex_data_type(inst->Memory.Format,
8685 &format, &num_format, &format_comp, &endian);
8686 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8687 vtx.op = FETCH_OP_VFETCH;
8688 vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
8689 vtx.buffer_index_mode = rat_index_mode;
8690 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8691 vtx.src_gpr = ctx->thread_id_gpr;
8692 vtx.src_sel_x = 1;
8693 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8694 vtx.dst_sel_x = desc->swizzle[0];
8695 vtx.dst_sel_y = desc->swizzle[1];
8696 vtx.dst_sel_z = desc->swizzle[2];
8697 vtx.dst_sel_w = desc->swizzle[3];
8698 vtx.srf_mode_all = 1;
8699 vtx.data_format = format;
8700 vtx.num_format_all = num_format;
8701 vtx.format_comp_all = format_comp;
8702 vtx.endian = endian;
8703 vtx.offset = 0;
8704 vtx.mega_fetch_count = 3;
8705 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8706 if (r)
8707 return r;
8708 cf = ctx->bc->cf_last;
8709 cf->barrier = 1;
8710 return 0;
8711 }
8712
8713 static int tgsi_load_lds(struct r600_shader_ctx *ctx)
8714 {
8715 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8716 struct r600_bytecode_alu alu;
8717 int r;
8718 int temp_reg = r600_get_temp(ctx);
8719
8720 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8721 alu.op = ALU_OP1_MOV;
8722 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
8723 alu.dst.sel = temp_reg;
8724 alu.dst.write = 1;
8725 alu.last = 1;
8726 r = r600_bytecode_add_alu(ctx->bc, &alu);
8727 if (r)
8728 return r;
8729
8730 r = do_lds_fetch_values(ctx, temp_reg,
8731 ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);
8732 if (r)
8733 return r;
8734 return 0;
8735 }
8736
8737 static int tgsi_load(struct r600_shader_ctx *ctx)
8738 {
8739 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8740 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
8741 return tgsi_load_rat(ctx);
8742 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
8743 return tgsi_load_gds(ctx);
8744 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
8745 return tgsi_load_buffer(ctx);
8746 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
8747 return tgsi_load_lds(ctx);
8748 return 0;
8749 }
8750
8751 static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
8752 {
8753 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8754 struct r600_bytecode_cf *cf;
8755 int r, i;
8756 unsigned rat_index_mode;
8757 int lasti;
8758 int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);
8759
8760 r = load_buffer_coord(ctx, 0, treg2);
8761 if (r)
8762 return r;
8763
8764 rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8765 if (rat_index_mode)
8766 egcm_load_index_reg(ctx->bc, 1, false);
8767
8768 for (i = 0; i <= 3; i++) {
8769 struct r600_bytecode_alu alu;
8770 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8771 alu.op = ALU_OP1_MOV;
8772 alu.dst.sel = temp_reg;
8773 alu.dst.chan = i;
8774 alu.src[0].sel = V_SQ_ALU_SRC_0;
8775 alu.last = (i == 3);
8776 alu.dst.write = 1;
8777 r = r600_bytecode_add_alu(ctx->bc, &alu);
8778 if (r)
8779 return r;
8780 }
8781
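/* store one dword per enabled component: temp_reg.x holds the element index
 * (base + i), ctx->temp_reg.x the value, and each MEM_RAT STORE_TYPED writes
 * a single element (comp_mask = 1) */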
8782 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8783 for (i = 0; i <= lasti; i++) {
8784 struct r600_bytecode_alu alu;
8785 if (!((1 << i) & inst->Dst[0].Register.WriteMask))
8786 continue;
8787
8788 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
8789 temp_reg, 0,
8790 treg2, 0,
8791 V_SQ_ALU_SRC_LITERAL, i);
8792 if (r)
8793 return r;
8794
8795 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8796 alu.op = ALU_OP1_MOV;
8797 alu.dst.sel = ctx->temp_reg;
8798 alu.dst.chan = 0;
8799
8800 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
8801 alu.last = 1;
8802 alu.dst.write = 1;
8803 r = r600_bytecode_add_alu(ctx->bc, &alu);
8804 if (r)
8805 return r;
8806
8807 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8808 cf = ctx->bc->cf_last;
8809
8810 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];
8811 cf->rat.inst = V_RAT_INST_STORE_TYPED;
8812 cf->rat.index_mode = rat_index_mode;
8813 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
8814 cf->output.gpr = ctx->temp_reg;
8815 cf->output.index_gpr = temp_reg;
8816 cf->output.comp_mask = 1;
8817 cf->output.burst_count = 1;
8818 cf->vpm = 1;
8819 cf->barrier = 1;
8820 cf->output.elem_size = 0;
8821 }
8822 return 0;
8823 }
8824
8825 static int tgsi_store_rat(struct r600_shader_ctx *ctx)
8826 {
8827 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8828 struct r600_bytecode_cf *cf;
8829 bool src_requires_loading = false;
8830 int val_gpr, idx_gpr;
8831 int r, i;
8832 unsigned rat_index_mode;
8833
8834 rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8835
8836 r = load_index_src(ctx, 0, &idx_gpr);
8837 if (r)
8838 return r;
8839
8840 if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
8841 src_requires_loading = true;
8842
8843 if (src_requires_loading) {
8844 struct r600_bytecode_alu alu;
8845 for (i = 0; i < 4; i++) {
8846 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8847 alu.op = ALU_OP1_MOV;
8848 alu.dst.sel = ctx->temp_reg;
8849 alu.dst.chan = i;
8850
8851 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
8852 if (i == 3)
8853 alu.last = 1;
8854 alu.dst.write = 1;
8855 r = r600_bytecode_add_alu(ctx->bc, &alu);
8856 if (r)
8857 return r;
8858 }
8859 val_gpr = ctx->temp_reg;
8860 } else
8861 val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
8862 if (rat_index_mode)
8863 egcm_load_index_reg(ctx->bc, 1, false);
8864
8865 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8866 cf = ctx->bc->cf_last;
8867
8868 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
8869 cf->rat.inst = V_RAT_INST_STORE_TYPED;
8870 cf->rat.index_mode = rat_index_mode;
8871 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
8872 cf->output.gpr = val_gpr;
8873 cf->output.index_gpr = idx_gpr;
8874 cf->output.comp_mask = 0xf;
8875 cf->output.burst_count = 1;
8876 cf->vpm = 1;
8877 cf->barrier = 1;
8878 cf->output.elem_size = 0;
8879 return 0;
8880 }
8881
8882 static int tgsi_store_lds(struct r600_shader_ctx *ctx)
8883 {
8884 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8885 struct r600_bytecode_alu alu;
8886 int r, i, lasti;
8887 int write_mask = inst->Dst[0].Register.WriteMask;
8888 int temp_reg = r600_get_temp(ctx);
8889
8890 /* LDS write */
8891 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8892 alu.op = ALU_OP1_MOV;
8893 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8894 alu.dst.sel = temp_reg;
8895 alu.dst.write = 1;
8896 alu.last = 1;
8897 r = r600_bytecode_add_alu(ctx->bc, &alu);
8898 if (r)
8899 return r;
8900
8901 lasti = tgsi_last_instruction(write_mask);
8902 for (i = 1; i <= lasti; i++) {
8903 if (!(write_mask & (1 << i)))
8904 continue;
8905 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
8906 temp_reg, i,
8907 temp_reg, 0,
8908 V_SQ_ALU_SRC_LITERAL, 4 * i);
8909 if (r)
8910 return r;
8911 }
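/* emit the writes: when both channels of an xy or zw pair are enabled,
 * LDS_WRITE_REL stores the two dwords in one op; otherwise each channel gets
 * its own LDS_WRITE */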
8912 for (i = 0; i <= lasti; i++) {
8913 if (!(write_mask & (1 << i)))
8914 continue;
8915
8916 if ((i == 0 && ((write_mask & 3) == 3)) ||
8917 (i == 2 && ((write_mask & 0xc) == 0xc))) {
8918 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8919 alu.op = LDS_OP3_LDS_WRITE_REL;
8920
8921 alu.src[0].sel = temp_reg;
8922 alu.src[0].chan = i;
8923 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
8924 r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
8925 alu.last = 1;
8926 alu.is_lds_idx_op = true;
8927 alu.lds_idx = 1;
8928 r = r600_bytecode_add_alu(ctx->bc, &alu);
8929 if (r)
8930 return r;
8931 i += 1;
8932 continue;
8933 }
8934 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8935 alu.op = LDS_OP2_LDS_WRITE;
8936
8937 alu.src[0].sel = temp_reg;
8938 alu.src[0].chan = i;
8939 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
8940
8941 alu.last = 1;
8942 alu.is_lds_idx_op = true;
8943
8944 r = r600_bytecode_add_alu(ctx->bc, &alu);
8945 if (r)
8946 return r;
8947 }
8948 return 0;
8949 }
8950
8951 static int tgsi_store(struct r600_shader_ctx *ctx)
8952 {
8953 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8954 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
8955 return tgsi_store_buffer_rat(ctx);
8956 else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
8957 return tgsi_store_lds(ctx);
8958 else
8959 return tgsi_store_rat(ctx);
8960 }
8961
8962 static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
8963 {
8964 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8965 /* have to work out the offset into the RAT immediate return buffer */
8966 struct r600_bytecode_alu alu;
8967 struct r600_bytecode_vtx vtx;
8968 struct r600_bytecode_cf *cf;
8969 int r;
8970 int idx_gpr;
8971 unsigned format, num_format, format_comp, endian;
8972 const struct util_format_description *desc;
8973 unsigned rat_index_mode;
8974 unsigned immed_base;
8975 unsigned rat_base;
8976
8977 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
8978 rat_base = ctx->shader->rat_base;
8979
8980 r = load_thread_id_gpr(ctx);
8981 if (r)
8982 return r;
8983
8984 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
8985 immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];
8986 rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];
8987
8988 r = load_buffer_coord(ctx, 1, ctx->temp_reg);
8989 if (r)
8990 return r;
8991 idx_gpr = ctx->temp_reg;
8992 } else {
8993 r = load_index_src(ctx, 1, &idx_gpr);
8994 if (r)
8995 return r;
8996 }
8997
8998 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8999
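/* pack the operands into the export GPR: for CMPXCHG the new value (src[3])
 * goes in .x and the compare value (src[2]) in .w (.z on cayman); every other
 * RAT atomic takes its single operand in .x */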
9000 if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
9001 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9002 alu.op = ALU_OP1_MOV;
9003 alu.dst.sel = ctx->thread_id_gpr;
9004 alu.dst.chan = 0;
9005 alu.dst.write = 1;
9006 r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
9007 alu.last = 1;
9008 r = r600_bytecode_add_alu(ctx->bc, &alu);
9009 if (r)
9010 return r;
9011
9012 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9013 alu.op = ALU_OP1_MOV;
9014 alu.dst.sel = ctx->thread_id_gpr;
9015 if (ctx->bc->chip_class == CAYMAN)
9016 alu.dst.chan = 2;
9017 else
9018 alu.dst.chan = 3;
9019 alu.dst.write = 1;
9020 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
9021 alu.last = 1;
9022 r = r600_bytecode_add_alu(ctx->bc, &alu);
9023 if (r)
9024 return r;
9025 } else {
9026 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9027 alu.op = ALU_OP1_MOV;
9028 alu.dst.sel = ctx->thread_id_gpr;
9029 alu.dst.chan = 0;
9030 alu.dst.write = 1;
9031 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
9032 alu.last = 1;
9033 r = r600_bytecode_add_alu(ctx->bc, &alu);
9034 if (r)
9035 return r;
9036 }
9037
9038 if (rat_index_mode)
9039 egcm_load_index_reg(ctx->bc, 1, false);
9040 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
9041 cf = ctx->bc->cf_last;
9042
9043 cf->rat.id = rat_base + inst->Src[0].Register.Index;
9044 cf->rat.inst = ctx->inst_info->op;
9045 cf->rat.index_mode = rat_index_mode;
9046 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
9047 cf->output.gpr = ctx->thread_id_gpr;
9048 cf->output.index_gpr = idx_gpr;
9049 cf->output.comp_mask = 0xf;
9050 cf->output.burst_count = 1;
9051 cf->vpm = 1;
9052 cf->barrier = 1;
9053 cf->mark = 1;
9054 cf->output.elem_size = 0;
9055 r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
9056 cf = ctx->bc->cf_last;
9057 cf->barrier = 1;
9058 cf->cf_addr = 1;
9059
9060 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
9061 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
9062 desc = util_format_description(inst->Memory.Format);
9063 r600_vertex_data_type(inst->Memory.Format,
9064 &format, &num_format, &format_comp, &endian);
9065 vtx.dst_sel_x = desc->swizzle[0];
9066 } else {
9067 format = FMT_32;
9068 num_format = 1;
9069 format_comp = 0;
9070 endian = 0;
9071 vtx.dst_sel_x = 0;
9072 }
9073 vtx.op = FETCH_OP_VFETCH;
9074 vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
9075 vtx.buffer_index_mode = rat_index_mode;
9076 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
9077 vtx.src_gpr = ctx->thread_id_gpr;
9078 vtx.src_sel_x = 1;
9079 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
9080 vtx.dst_sel_y = 7;
9081 vtx.dst_sel_z = 7;
9082 vtx.dst_sel_w = 7;
9083 vtx.use_const_fields = 0;
9084 vtx.srf_mode_all = 1;
9085 vtx.data_format = format;
9086 vtx.num_format_all = num_format;
9087 vtx.format_comp_all = format_comp;
9088 vtx.endian = endian;
9089 vtx.offset = 0;
9090 vtx.mega_fetch_count = 0xf;
9091 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
9092 if (r)
9093 return r;
9094 cf = ctx->bc->cf_last;
9095 cf->vpm = 1;
9096 cf->barrier = 1;
9097 return 0;
9098 }
9099
9100 static int get_gds_op(int opcode)
9101 {
9102 switch (opcode) {
9103 case TGSI_OPCODE_ATOMUADD:
9104 return FETCH_OP_GDS_ADD_RET;
9105 case TGSI_OPCODE_ATOMAND:
9106 return FETCH_OP_GDS_AND_RET;
9107 case TGSI_OPCODE_ATOMOR:
9108 return FETCH_OP_GDS_OR_RET;
9109 case TGSI_OPCODE_ATOMXOR:
9110 return FETCH_OP_GDS_XOR_RET;
9111 case TGSI_OPCODE_ATOMUMIN:
9112 return FETCH_OP_GDS_MIN_UINT_RET;
9113 case TGSI_OPCODE_ATOMUMAX:
9114 return FETCH_OP_GDS_MAX_UINT_RET;
9115 case TGSI_OPCODE_ATOMXCHG:
9116 return FETCH_OP_GDS_XCHG_RET;
9117 case TGSI_OPCODE_ATOMCAS:
9118 return FETCH_OP_GDS_CMP_XCHG_RET;
9119 default:
9120 return -1;
9121 }
9122 }
9123
9124 static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
9125 {
9126 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9127 struct r600_bytecode_gds gds;
9128 struct r600_bytecode_alu alu;
9129 int gds_op = get_gds_op(inst->Instruction.Opcode);
9130 int r;
9131 int uav_id = 0;
9132 int uav_index_mode = 0;
9133 bool is_cm = (ctx->bc->chip_class == CAYMAN);
9134
9135 if (gds_op == -1) {
9136 fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
9137 return -1;
9138 }
9139
9140 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
9141 if (r)
9142 return r;
9143
9144 if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET) {
9145 if (inst->Src[3].Register.File == TGSI_FILE_IMMEDIATE) {
9146 int value = (ctx->literals[4 * inst->Src[3].Register.Index + inst->Src[3].Register.SwizzleX]);
9147 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9148 alu.op = ALU_OP1_MOV;
9149 alu.dst.sel = ctx->temp_reg;
9150 alu.dst.chan = is_cm ? 2 : 1;
9151 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
9152 alu.src[0].value = value;
9153 alu.last = 1;
9154 alu.dst.write = 1;
9155 r = r600_bytecode_add_alu(ctx->bc, &alu);
9156 if (r)
9157 return r;
9158 } else {
9159 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9160 alu.op = ALU_OP1_MOV;
9161 alu.dst.sel = ctx->temp_reg;
9162 alu.dst.chan = is_cm ? 2 : 1;
9163 r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
9164 alu.last = 1;
9165 alu.dst.write = 1;
9166 r = r600_bytecode_add_alu(ctx->bc, &alu);
9167 if (r)
9168 return r;
9169 }
9170 }
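/* a GDS add of a negative immediate is emitted as GDS_SUB of the absolute value */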
9171 if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
9172 int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
9173 int abs_value = abs(value);
9174 if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
9175 gds_op = FETCH_OP_GDS_SUB_RET;
9176 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9177 alu.op = ALU_OP1_MOV;
9178 alu.dst.sel = ctx->temp_reg;
9179 alu.dst.chan = is_cm ? 1 : 0;
9180 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
9181 alu.src[0].value = abs_value;
9182 alu.last = 1;
9183 alu.dst.write = 1;
9184 r = r600_bytecode_add_alu(ctx->bc, &alu);
9185 if (r)
9186 return r;
9187 } else {
9188 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9189 alu.op = ALU_OP1_MOV;
9190 alu.dst.sel = ctx->temp_reg;
9191 alu.dst.chan = is_cm ? 1 : 0;
9192 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
9193 alu.last = 1;
9194 alu.dst.write = 1;
9195 r = r600_bytecode_add_alu(ctx->bc, &alu);
9196 if (r)
9197 return r;
9198 }
9199
9200
9201 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
9202 gds.op = gds_op;
9203 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
9204 gds.uav_id = is_cm ? 0 : uav_id;
9205 gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
9206 gds.src_gpr = ctx->temp_reg;
9207 gds.src_gpr2 = 0;
9208 gds.src_sel_x = is_cm ? 0 : 4;
9209 gds.src_sel_y = is_cm ? 1 : 0;
9210 if (gds_op == FETCH_OP_GDS_CMP_XCHG_RET)
9211 gds.src_sel_z = is_cm ? 2 : 1;
9212 else
9213 gds.src_sel_z = 7;
9214 gds.dst_sel_x = 0;
9215 gds.dst_sel_y = 7;
9216 gds.dst_sel_z = 7;
9217 gds.dst_sel_w = 7;
9218 gds.alloc_consume = !is_cm;
9219
9220 r = r600_bytecode_add_gds(ctx->bc, &gds);
9221 if (r)
9222 return r;
9223 ctx->bc->cf_last->vpm = 1;
9224 return 0;
9225 }
9226
9227 static int get_lds_op(int opcode)
9228 {
9229 switch (opcode) {
9230 case TGSI_OPCODE_ATOMUADD:
9231 return LDS_OP2_LDS_ADD_RET;
9232 case TGSI_OPCODE_ATOMAND:
9233 return LDS_OP2_LDS_AND_RET;
9234 case TGSI_OPCODE_ATOMOR:
9235 return LDS_OP2_LDS_OR_RET;
9236 case TGSI_OPCODE_ATOMXOR:
9237 return LDS_OP2_LDS_XOR_RET;
9238 case TGSI_OPCODE_ATOMUMIN:
9239 return LDS_OP2_LDS_MIN_UINT_RET;
9240 case TGSI_OPCODE_ATOMUMAX:
9241 return LDS_OP2_LDS_MAX_UINT_RET;
9242 case TGSI_OPCODE_ATOMIMIN:
9243 return LDS_OP2_LDS_MIN_INT_RET;
9244 case TGSI_OPCODE_ATOMIMAX:
9245 return LDS_OP2_LDS_MAX_INT_RET;
9246 case TGSI_OPCODE_ATOMXCHG:
9247 return LDS_OP2_LDS_XCHG_RET;
9248 case TGSI_OPCODE_ATOMCAS:
9249 return LDS_OP3_LDS_CMP_XCHG_RET;
9250 default:
9251 return -1;
9252 }
9253 }
9254
9255 static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx)
9256 {
9257 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9258 int lds_op = get_lds_op(inst->Instruction.Opcode);
9259 int r;
9260
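/* LDS atomics return their result through the LDS output queue: issue the op
 * first, then pop it via LDS_OQ_A_POP below */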
9261 struct r600_bytecode_alu alu;
9262 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9263 alu.op = lds_op;
9264 alu.is_lds_idx_op = true;
9265 alu.last = 1;
9266 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
9267 r600_bytecode_src(&alu.src[1], &ctx->src[2], 0);
9268 if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET)
9269 r600_bytecode_src(&alu.src[2], &ctx->src[3], 0);
9270 else
9271 alu.src[2].sel = V_SQ_ALU_SRC_0;
9272 r = r600_bytecode_add_alu(ctx->bc, &alu);
9273 if (r)
9274 return r;
9275
9276 /* then read from LDS_OQ_A_POP */
9277 memset(&alu, 0, sizeof(alu));
9278
9279 alu.op = ALU_OP1_MOV;
9280 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
9281 alu.src[0].chan = 0;
9282 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
9283 alu.dst.write = 1;
9284 alu.last = 1;
9285 r = r600_bytecode_add_alu(ctx->bc, &alu);
9286 if (r)
9287 return r;
9288
9289 return 0;
9290 }
9291
9292 static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
9293 {
9294 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9295 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
9296 return tgsi_atomic_op_rat(ctx);
9297 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
9298 return tgsi_atomic_op_gds(ctx);
9299 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9300 return tgsi_atomic_op_rat(ctx);
9301 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
9302 return tgsi_atomic_op_lds(ctx);
9303 return 0;
9304 }
9305
9306 static int tgsi_resq(struct r600_shader_ctx *ctx)
9307 {
9308 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9309 unsigned sampler_index_mode;
9310 struct r600_bytecode_tex tex;
9311 int r;
9312 boolean has_txq_cube_array_z = false;
9313
9314 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
9315 (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
9316 if (ctx->bc->chip_class < EVERGREEN)
9317 ctx->shader->uses_tex_buffers = true;
9318 unsigned eg_buffer_base = R600_IMAGE_REAL_RESOURCE_OFFSET;
9320 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
9321 eg_buffer_base += ctx->info.file_count[TGSI_FILE_IMAGE];
9322 return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset, eg_buffer_base);
9323 }
9324
9325 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
9326 inst->Dst[0].Register.WriteMask & 4) {
9327 ctx->shader->has_txq_cube_array_z_comp = true;
9328 has_txq_cube_array_z = true;
9329 }
9330
9331 sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
9332 if (sampler_index_mode)
9333 egcm_load_index_reg(ctx->bc, 1, false);
9334
9335
9336 /* does this shader want the number of layers from TXQ for a cube array? */
9337 if (has_txq_cube_array_z) {
9338 int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
9339 struct r600_bytecode_alu alu;
9340
9341 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9342 alu.op = ALU_OP1_MOV;
9343
9344 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
9345 /* on evergreen each dword holds the number of cubes */
9346 alu.src[0].sel += id / 4;
9347 alu.src[0].chan = id % 4;
9348 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
9349 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
9350 alu.last = 1;
9351 r = r600_bytecode_add_alu(ctx->bc, &alu);
9352 if (r)
9353 return r;
9354 /* disable writemask from texture instruction */
9355 inst->Dst[0].Register.WriteMask &= ~4;
9356 }
9357 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
9358 tex.op = ctx->inst_info->op;
9359 tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
9360 tex.sampler_index_mode = sampler_index_mode;
9361 tex.resource_id = tex.sampler_id;
9362 tex.resource_index_mode = sampler_index_mode;
9363 tex.src_sel_x = 4;
9364 tex.src_sel_y = 4;
9365 tex.src_sel_z = 4;
9366 tex.src_sel_w = 4;
9367 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
9368 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
9369 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
9370 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
9371 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
9372 r = r600_bytecode_add_tex(ctx->bc, &tex);
9373 if (r)
9374 return r;
9375
9376 return 0;
9377 }
9378
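/* LRP: dst = src0 * src1 + (1 - src0) * src2, expanded as ADD + MUL + MULADD.
 * When src0 is the inline 0.5 constant this collapses to (src1 + src2) * 0.5,
 * with the halving done by the output modifier (omod = 3) */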
9379 static int tgsi_lrp(struct r600_shader_ctx *ctx)
9380 {
9381 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9382 struct r600_bytecode_alu alu;
9383 unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9384 unsigned i, temp_regs[2];
9385 int r;
9386
9387 /* optimize if it's just an equal balance */
9388 if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
9389 for (i = 0; i < lasti + 1; i++) {
9390 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9391 continue;
9392
9393 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9394 alu.op = ALU_OP2_ADD;
9395 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
9396 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9397 alu.omod = 3;
9398 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9399 alu.dst.chan = i;
9400 if (i == lasti) {
9401 alu.last = 1;
9402 }
9403 r = r600_bytecode_add_alu(ctx->bc, &alu);
9404 if (r)
9405 return r;
9406 }
9407 return 0;
9408 }
9409
9410 /* 1 - src0 */
9411 for (i = 0; i < lasti + 1; i++) {
9412 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9413 continue;
9414
9415 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9416 alu.op = ALU_OP2_ADD;
9417 alu.src[0].sel = V_SQ_ALU_SRC_1;
9418 alu.src[0].chan = 0;
9419 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
9420 r600_bytecode_src_toggle_neg(&alu.src[1]);
9421 alu.dst.sel = ctx->temp_reg;
9422 alu.dst.chan = i;
9423 if (i == lasti) {
9424 alu.last = 1;
9425 }
9426 alu.dst.write = 1;
9427 r = r600_bytecode_add_alu(ctx->bc, &alu);
9428 if (r)
9429 return r;
9430 }
9431
9432 /* (1 - src0) * src2 */
9433 for (i = 0; i < lasti + 1; i++) {
9434 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9435 continue;
9436
9437 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9438 alu.op = ALU_OP2_MUL;
9439 alu.src[0].sel = ctx->temp_reg;
9440 alu.src[0].chan = i;
9441 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9442 alu.dst.sel = ctx->temp_reg;
9443 alu.dst.chan = i;
9444 if (i == lasti) {
9445 alu.last = 1;
9446 }
9447 alu.dst.write = 1;
9448 r = r600_bytecode_add_alu(ctx->bc, &alu);
9449 if (r)
9450 return r;
9451 }
9452
9453 /* src0 * src1 + (1 - src0) * src2 */
9454 if (ctx->src[0].abs)
9455 temp_regs[0] = r600_get_temp(ctx);
9456 else
9457 temp_regs[0] = 0;
9458 if (ctx->src[1].abs)
9459 temp_regs[1] = r600_get_temp(ctx);
9460 else
9461 temp_regs[1] = 0;
9462
9463 for (i = 0; i < lasti + 1; i++) {
9464 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9465 continue;
9466
9467 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9468 alu.op = ALU_OP3_MULADD;
9469 alu.is_op3 = 1;
9470 r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
9471 if (r)
9472 return r;
9473 r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
9474 if (r)
9475 return r;
9476 alu.src[2].sel = ctx->temp_reg;
9477 alu.src[2].chan = i;
9478
9479 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9480 alu.dst.chan = i;
9481 if (i == lasti) {
9482 alu.last = 1;
9483 }
9484 r = r600_bytecode_add_alu(ctx->bc, &alu);
9485 if (r)
9486 return r;
9487 }
9488 return 0;
9489 }
9490
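/* CMP: dst = (src0 < 0.0) ? src1 : src2, i.e. CNDGE with src1/src2 swapped.
 * For a source with both abs and neg set, -|x| < 0 reduces to x != 0, so the
 * modifiers are dropped and CNDE is used instead */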
9491 static int tgsi_cmp(struct r600_shader_ctx *ctx)
9492 {
9493 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9494 struct r600_bytecode_alu alu;
9495 int i, r, j;
9496 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9497 int temp_regs[3];
9498 unsigned op;
9499
9500 if (ctx->src[0].abs && ctx->src[0].neg) {
9501 op = ALU_OP3_CNDE;
9502 ctx->src[0].abs = 0;
9503 ctx->src[0].neg = 0;
9504 } else {
9505 op = ALU_OP3_CNDGE;
9506 }
9507
9508 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
9509 temp_regs[j] = 0;
9510 if (ctx->src[j].abs)
9511 temp_regs[j] = r600_get_temp(ctx);
9512 }
9513
9514 for (i = 0; i < lasti + 1; i++) {
9515 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9516 continue;
9517
9518 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9519 alu.op = op;
9520 r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
9521 if (r)
9522 return r;
9523 r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
9524 if (r)
9525 return r;
9526 r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
9527 if (r)
9528 return r;
9529 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9530 alu.dst.chan = i;
9531 alu.dst.write = 1;
9532 alu.is_op3 = 1;
9533 if (i == lasti)
9534 alu.last = 1;
9535 r = r600_bytecode_add_alu(ctx->bc, &alu);
9536 if (r)
9537 return r;
9538 }
9539 return 0;
9540 }
9541
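/* UCMP: dst = (src0 != 0) ? src1 : src2, i.e. CNDE_INT with src1/src2 swapped */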
9542 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
9543 {
9544 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9545 struct r600_bytecode_alu alu;
9546 int i, r;
9547 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9548
9549 for (i = 0; i < lasti + 1; i++) {
9550 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9551 continue;
9552
9553 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9554 alu.op = ALU_OP3_CNDE_INT;
9555 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9556 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9557 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
9558 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9559 alu.dst.chan = i;
9560 alu.dst.write = 1;
9561 alu.is_op3 = 1;
9562 if (i == lasti)
9563 alu.last = 1;
9564 r = r600_bytecode_add_alu(ctx->bc, &alu);
9565 if (r)
9566 return r;
9567 }
9568 return 0;
9569 }
9570
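/* TGSI EXP: dst.x = 2^floor(src.x), dst.y = src.x - floor(src.x),
 * dst.z = 2^src.x, dst.w = 1.0. EXP_IEEE is a t-slot op, so on cayman it is
 * replicated across three vector slots with only the needed channel written */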
9571 static int tgsi_exp(struct r600_shader_ctx *ctx)
9572 {
9573 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9574 struct r600_bytecode_alu alu;
9575 int r;
9576 unsigned i;
9577
9578 /* result.x = 2^floor(src); */
9579 if (inst->Dst[0].Register.WriteMask & 1) {
9580 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9581
9582 alu.op = ALU_OP1_FLOOR;
9583 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9584
9585 alu.dst.sel = ctx->temp_reg;
9586 alu.dst.chan = 0;
9587 alu.dst.write = 1;
9588 alu.last = 1;
9589 r = r600_bytecode_add_alu(ctx->bc, &alu);
9590 if (r)
9591 return r;
9592
9593 if (ctx->bc->chip_class == CAYMAN) {
9594 for (i = 0; i < 3; i++) {
9595 alu.op = ALU_OP1_EXP_IEEE;
9596 alu.src[0].sel = ctx->temp_reg;
9597 alu.src[0].chan = 0;
9598
9599 alu.dst.sel = ctx->temp_reg;
9600 alu.dst.chan = i;
9601 alu.dst.write = i == 0;
9602 alu.last = i == 2;
9603 r = r600_bytecode_add_alu(ctx->bc, &alu);
9604 if (r)
9605 return r;
9606 }
9607 } else {
9608 alu.op = ALU_OP1_EXP_IEEE;
9609 alu.src[0].sel = ctx->temp_reg;
9610 alu.src[0].chan = 0;
9611
9612 alu.dst.sel = ctx->temp_reg;
9613 alu.dst.chan = 0;
9614 alu.dst.write = 1;
9615 alu.last = 1;
9616 r = r600_bytecode_add_alu(ctx->bc, &alu);
9617 if (r)
9618 return r;
9619 }
9620 }
9621
9622 	/* result.y = src.x - floor(src.x); */
9623 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9624 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9625
9626 alu.op = ALU_OP1_FRACT;
9627 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9628
9629 alu.dst.sel = ctx->temp_reg;
9630 #if 0
9631 r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9632 if (r)
9633 return r;
9634 #endif
9635 alu.dst.write = 1;
9636 alu.dst.chan = 1;
9637
9638 alu.last = 1;
9639
9640 r = r600_bytecode_add_alu(ctx->bc, &alu);
9641 if (r)
9642 return r;
9643 }
9644
9645 	/* result.z = RoughApprox2ToX(src.x); */
9646 if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
9647 if (ctx->bc->chip_class == CAYMAN) {
9648 for (i = 0; i < 3; i++) {
9649 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9650 alu.op = ALU_OP1_EXP_IEEE;
9651 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9652
9653 alu.dst.sel = ctx->temp_reg;
9654 alu.dst.chan = i;
9655 if (i == 2) {
9656 alu.dst.write = 1;
9657 alu.last = 1;
9658 }
9659
9660 r = r600_bytecode_add_alu(ctx->bc, &alu);
9661 if (r)
9662 return r;
9663 }
9664 } else {
9665 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9666 alu.op = ALU_OP1_EXP_IEEE;
9667 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9668
9669 alu.dst.sel = ctx->temp_reg;
9670 alu.dst.write = 1;
9671 alu.dst.chan = 2;
9672
9673 alu.last = 1;
9674
9675 r = r600_bytecode_add_alu(ctx->bc, &alu);
9676 if (r)
9677 return r;
9678 }
9679 }
9680
9681 /* result.w = 1.0;*/
9682 if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
9683 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9684
9685 alu.op = ALU_OP1_MOV;
9686 alu.src[0].sel = V_SQ_ALU_SRC_1;
9687 alu.src[0].chan = 0;
9688
9689 alu.dst.sel = ctx->temp_reg;
9690 alu.dst.chan = 3;
9691 alu.dst.write = 1;
9692 alu.last = 1;
9693 r = r600_bytecode_add_alu(ctx->bc, &alu);
9694 if (r)
9695 return r;
9696 }
9697 return tgsi_helper_copy(ctx, inst);
9698 }
9699
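/* TGSI LOG: dst.x = floor(log2(|src.x|)),
 * dst.y = |src.x| / 2^floor(log2(|src.x|)), dst.z = log2(|src.x|),
 * dst.w = 1.0. The y component has no direct instruction and is assembled
 * from LOG_IEEE, FLOOR, EXP_IEEE and RECIP_IEEE */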
9700 static int tgsi_log(struct r600_shader_ctx *ctx)
9701 {
9702 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9703 struct r600_bytecode_alu alu;
9704 int r;
9705 unsigned i;
9706
9707 /* result.x = floor(log2(|src|)); */
9708 if (inst->Dst[0].Register.WriteMask & 1) {
9709 if (ctx->bc->chip_class == CAYMAN) {
9710 for (i = 0; i < 3; i++) {
9711 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9712
9713 alu.op = ALU_OP1_LOG_IEEE;
9714 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9715 r600_bytecode_src_set_abs(&alu.src[0]);
9716
9717 alu.dst.sel = ctx->temp_reg;
9718 alu.dst.chan = i;
9719 if (i == 0)
9720 alu.dst.write = 1;
9721 if (i == 2)
9722 alu.last = 1;
9723 r = r600_bytecode_add_alu(ctx->bc, &alu);
9724 if (r)
9725 return r;
9726 }
9727
9728 } else {
9729 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9730
9731 alu.op = ALU_OP1_LOG_IEEE;
9732 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9733 r600_bytecode_src_set_abs(&alu.src[0]);
9734
9735 alu.dst.sel = ctx->temp_reg;
9736 alu.dst.chan = 0;
9737 alu.dst.write = 1;
9738 alu.last = 1;
9739 r = r600_bytecode_add_alu(ctx->bc, &alu);
9740 if (r)
9741 return r;
9742 }
9743
9744 alu.op = ALU_OP1_FLOOR;
9745 alu.src[0].sel = ctx->temp_reg;
9746 alu.src[0].chan = 0;
9747
9748 alu.dst.sel = ctx->temp_reg;
9749 alu.dst.chan = 0;
9750 alu.dst.write = 1;
9751 alu.last = 1;
9752
9753 r = r600_bytecode_add_alu(ctx->bc, &alu);
9754 if (r)
9755 return r;
9756 }
9757
9758 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
9759 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9760
9761 if (ctx->bc->chip_class == CAYMAN) {
9762 for (i = 0; i < 3; i++) {
9763 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9764
9765 alu.op = ALU_OP1_LOG_IEEE;
9766 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9767 r600_bytecode_src_set_abs(&alu.src[0]);
9768
9769 alu.dst.sel = ctx->temp_reg;
9770 alu.dst.chan = i;
9771 if (i == 1)
9772 alu.dst.write = 1;
9773 if (i == 2)
9774 alu.last = 1;
9775
9776 r = r600_bytecode_add_alu(ctx->bc, &alu);
9777 if (r)
9778 return r;
9779 }
9780 } else {
9781 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9782
9783 alu.op = ALU_OP1_LOG_IEEE;
9784 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9785 r600_bytecode_src_set_abs(&alu.src[0]);
9786
9787 alu.dst.sel = ctx->temp_reg;
9788 alu.dst.chan = 1;
9789 alu.dst.write = 1;
9790 alu.last = 1;
9791
9792 r = r600_bytecode_add_alu(ctx->bc, &alu);
9793 if (r)
9794 return r;
9795 }
9796
9797 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9798
9799 alu.op = ALU_OP1_FLOOR;
9800 alu.src[0].sel = ctx->temp_reg;
9801 alu.src[0].chan = 1;
9802
9803 alu.dst.sel = ctx->temp_reg;
9804 alu.dst.chan = 1;
9805 alu.dst.write = 1;
9806 alu.last = 1;
9807
9808 r = r600_bytecode_add_alu(ctx->bc, &alu);
9809 if (r)
9810 return r;
9811
9812 if (ctx->bc->chip_class == CAYMAN) {
9813 for (i = 0; i < 3; i++) {
9814 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9815 alu.op = ALU_OP1_EXP_IEEE;
9816 alu.src[0].sel = ctx->temp_reg;
9817 alu.src[0].chan = 1;
9818
9819 alu.dst.sel = ctx->temp_reg;
9820 alu.dst.chan = i;
9821 if (i == 1)
9822 alu.dst.write = 1;
9823 if (i == 2)
9824 alu.last = 1;
9825
9826 r = r600_bytecode_add_alu(ctx->bc, &alu);
9827 if (r)
9828 return r;
9829 }
9830 } else {
9831 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9832 alu.op = ALU_OP1_EXP_IEEE;
9833 alu.src[0].sel = ctx->temp_reg;
9834 alu.src[0].chan = 1;
9835
9836 alu.dst.sel = ctx->temp_reg;
9837 alu.dst.chan = 1;
9838 alu.dst.write = 1;
9839 alu.last = 1;
9840
9841 r = r600_bytecode_add_alu(ctx->bc, &alu);
9842 if (r)
9843 return r;
9844 }
9845
9846 if (ctx->bc->chip_class == CAYMAN) {
9847 for (i = 0; i < 3; i++) {
9848 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9849 alu.op = ALU_OP1_RECIP_IEEE;
9850 alu.src[0].sel = ctx->temp_reg;
9851 alu.src[0].chan = 1;
9852
9853 alu.dst.sel = ctx->temp_reg;
9854 alu.dst.chan = i;
9855 if (i == 1)
9856 alu.dst.write = 1;
9857 if (i == 2)
9858 alu.last = 1;
9859
9860 r = r600_bytecode_add_alu(ctx->bc, &alu);
9861 if (r)
9862 return r;
9863 }
9864 } else {
9865 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9866 alu.op = ALU_OP1_RECIP_IEEE;
9867 alu.src[0].sel = ctx->temp_reg;
9868 alu.src[0].chan = 1;
9869
9870 alu.dst.sel = ctx->temp_reg;
9871 alu.dst.chan = 1;
9872 alu.dst.write = 1;
9873 alu.last = 1;
9874
9875 r = r600_bytecode_add_alu(ctx->bc, &alu);
9876 if (r)
9877 return r;
9878 }
9879
9880 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9881
9882 alu.op = ALU_OP2_MUL;
9883
9884 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9885 r600_bytecode_src_set_abs(&alu.src[0]);
9886
9887 alu.src[1].sel = ctx->temp_reg;
9888 alu.src[1].chan = 1;
9889
9890 alu.dst.sel = ctx->temp_reg;
9891 alu.dst.chan = 1;
9892 alu.dst.write = 1;
9893 alu.last = 1;
9894
9895 r = r600_bytecode_add_alu(ctx->bc, &alu);
9896 if (r)
9897 return r;
9898 }
9899
9900 /* result.z = log2(|src|);*/
9901 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
9902 if (ctx->bc->chip_class == CAYMAN) {
9903 for (i = 0; i < 3; i++) {
9904 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9905
9906 alu.op = ALU_OP1_LOG_IEEE;
9907 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9908 r600_bytecode_src_set_abs(&alu.src[0]);
9909
9910 alu.dst.sel = ctx->temp_reg;
9911 if (i == 2)
9912 alu.dst.write = 1;
9913 alu.dst.chan = i;
9914 if (i == 2)
9915 alu.last = 1;
9916
9917 r = r600_bytecode_add_alu(ctx->bc, &alu);
9918 if (r)
9919 return r;
9920 }
9921 } else {
9922 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9923
9924 alu.op = ALU_OP1_LOG_IEEE;
9925 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9926 r600_bytecode_src_set_abs(&alu.src[0]);
9927
9928 alu.dst.sel = ctx->temp_reg;
9929 alu.dst.write = 1;
9930 alu.dst.chan = 2;
9931 alu.last = 1;
9932
9933 r = r600_bytecode_add_alu(ctx->bc, &alu);
9934 if (r)
9935 return r;
9936 }
9937 }
9938
9939 /* result.w = 1.0; */
9940 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
9941 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9942
9943 alu.op = ALU_OP1_MOV;
9944 alu.src[0].sel = V_SQ_ALU_SRC_1;
9945 alu.src[0].chan = 0;
9946
9947 alu.dst.sel = ctx->temp_reg;
9948 alu.dst.chan = 3;
9949 alu.dst.write = 1;
9950 alu.last = 1;
9951
9952 r = r600_bytecode_add_alu(ctx->bc, &alu);
9953 if (r)
9954 return r;
9955 }
9956
9957 return tgsi_helper_copy(ctx, inst);
9958 }
9959
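/* ARL/ARR/UARL on evergreen: FLT_TO_INT_FLOOR, FLT_TO_INT or a plain MOV into
 * the GPR backing AR or the CF index registers, after which the cached copy
 * is invalidated so it is reloaded before the next indirect access */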
9960 static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
9961 {
9962 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9963 struct r600_bytecode_alu alu;
9964 int r;
9965 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9966 unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
9967
9968 assert(inst->Dst[0].Register.Index < 3);
9969 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9970
9971 switch (inst->Instruction.Opcode) {
9972 case TGSI_OPCODE_ARL:
9973 alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
9974 break;
9975 case TGSI_OPCODE_ARR:
9976 alu.op = ALU_OP1_FLT_TO_INT;
9977 break;
9978 case TGSI_OPCODE_UARL:
9979 alu.op = ALU_OP1_MOV;
9980 break;
9981 default:
9982 assert(0);
9983 return -1;
9984 }
9985
9986 for (i = 0; i <= lasti; ++i) {
9987 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9988 continue;
9989 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9990 alu.last = i == lasti;
9991 alu.dst.sel = reg;
9992 alu.dst.chan = i;
9993 alu.dst.write = 1;
9994 r = r600_bytecode_add_alu(ctx->bc, &alu);
9995 if (r)
9996 return r;
9997 }
9998
9999 if (inst->Dst[0].Register.Index > 0)
10000 ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
10001 else
10002 ctx->bc->ar_loaded = 0;
10003
10004 return 0;
10005 }
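
/* ARL family on r600/r700: FLT_TO_INT is trans-only here, so the converted
 * address value is staged in the shared ar_reg GPR and the cached AR copy is
 * invalidated */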
10006 static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
10007 {
10008 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10009 struct r600_bytecode_alu alu;
10010 int r;
10011 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10012
10013 switch (inst->Instruction.Opcode) {
10014 case TGSI_OPCODE_ARL:
10015 memset(&alu, 0, sizeof(alu));
10016 alu.op = ALU_OP1_FLOOR;
10017 alu.dst.sel = ctx->bc->ar_reg;
10018 alu.dst.write = 1;
10019 for (i = 0; i <= lasti; ++i) {
10020 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
10021 alu.dst.chan = i;
10022 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10023 alu.last = i == lasti;
10024 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10025 return r;
10026 }
10027 }
10028
10029 memset(&alu, 0, sizeof(alu));
10030 alu.op = ALU_OP1_FLT_TO_INT;
10031 alu.src[0].sel = ctx->bc->ar_reg;
10032 alu.dst.sel = ctx->bc->ar_reg;
10033 alu.dst.write = 1;
10034 /* FLT_TO_INT is trans-only on r600/r700 */
10035 alu.last = TRUE;
10036 for (i = 0; i <= lasti; ++i) {
10037 alu.dst.chan = i;
10038 alu.src[0].chan = i;
10039 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10040 return r;
10041 }
10042 break;
10043 case TGSI_OPCODE_ARR:
10044 memset(&alu, 0, sizeof(alu));
10045 alu.op = ALU_OP1_FLT_TO_INT;
10046 alu.dst.sel = ctx->bc->ar_reg;
10047 alu.dst.write = 1;
10048 /* FLT_TO_INT is trans-only on r600/r700 */
10049 alu.last = TRUE;
10050 for (i = 0; i <= lasti; ++i) {
10051 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
10052 alu.dst.chan = i;
10053 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10054 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10055 return r;
10056 }
10057 }
10058 break;
10059 case TGSI_OPCODE_UARL:
10060 memset(&alu, 0, sizeof(alu));
10061 alu.op = ALU_OP1_MOV;
10062 alu.dst.sel = ctx->bc->ar_reg;
10063 alu.dst.write = 1;
10064 for (i = 0; i <= lasti; ++i) {
10065 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
10066 alu.dst.chan = i;
10067 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10068 alu.last = i == lasti;
10069 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
10070 return r;
10071 }
10072 }
10073 break;
10074 default:
10075 assert(0);
10076 return -1;
10077 }
10078
10079 ctx->bc->ar_loaded = 0;
10080 return 0;
10081 }
10082
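/* DST: dst = {1.0, src0.y * src1.y, src0.z, src1.w}, emitted as four MULs
 * with the inline 1.0 constant standing in for the missing operands */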
10083 static int tgsi_opdst(struct r600_shader_ctx *ctx)
10084 {
10085 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10086 struct r600_bytecode_alu alu;
10087 int i, r = 0;
10088
10089 for (i = 0; i < 4; i++) {
10090 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10091
10092 alu.op = ALU_OP2_MUL;
10093 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10094
10095 if (i == 0 || i == 3) {
10096 alu.src[0].sel = V_SQ_ALU_SRC_1;
10097 } else {
10098 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
10099 }
10100
10101 if (i == 0 || i == 2) {
10102 alu.src[1].sel = V_SQ_ALU_SRC_1;
10103 } else {
10104 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
10105 }
10106 if (i == 3)
10107 alu.last = 1;
10108 r = r600_bytecode_add_alu(ctx->bc, &alu);
10109 if (r)
10110 return r;
10111 }
10112 return 0;
10113 }
10114
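/* emit the predicate ALU op that drives an IF: compare src against 0 with
 * execute_mask/update_pred set so the following JUMP sees the result */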
10115 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type,
10116 struct r600_bytecode_alu_src *src)
10117 {
10118 struct r600_bytecode_alu alu;
10119 int r;
10120
10121 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10122 alu.op = opcode;
10123 alu.execute_mask = 1;
10124 alu.update_pred = 1;
10125
10126 alu.dst.sel = ctx->temp_reg;
10127 alu.dst.write = 1;
10128 alu.dst.chan = 0;
10129
10130 alu.src[0] = *src;
10131 alu.src[1].sel = V_SQ_ALU_SRC_0;
10132 alu.src[1].chan = 0;
10133
10134 alu.last = 1;
10135
10136 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
10137 if (r)
10138 return r;
10139 return 0;
10140 }
10141
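/* emit 'pops' stack pops. When the previous CF instruction is a plain ALU
 * clause it is cheaper to retag it as ALU_POP_AFTER / ALU_POP2_AFTER than to
 * add a standalone POP */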
10142 static int pops(struct r600_shader_ctx *ctx, int pops)
10143 {
10144 unsigned force_pop = ctx->bc->force_add_cf;
10145
10146 if (!force_pop) {
10147 int alu_pop = 3;
10148 if (ctx->bc->cf_last) {
10149 if (ctx->bc->cf_last->op == CF_OP_ALU)
10150 alu_pop = 0;
10151 else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
10152 alu_pop = 1;
10153 }
10154 alu_pop += pops;
10155 if (alu_pop == 1) {
10156 ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
10157 ctx->bc->force_add_cf = 1;
10158 } else if (alu_pop == 2) {
10159 ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
10160 ctx->bc->force_add_cf = 1;
10161 } else {
10162 force_pop = 1;
10163 }
10164 }
10165
10166 if (force_pop) {
10167 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
10168 ctx->bc->cf_last->pop_count = pops;
10169 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
10170 }
10171
10172 return 0;
10173 }
10174
10175 static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
10176 unsigned reason)
10177 {
10178 struct r600_stack_info *stack = &ctx->bc->stack;
10179 unsigned elements;
10180 int entries;
10181
10182 unsigned entry_size = stack->entry_size;
10183
10184 	elements = (stack->loop + stack->push_wqm) * entry_size;
10185 elements += stack->push;
10186
10187 switch (ctx->bc->chip_class) {
10188 case R600:
10189 case R700:
10190 /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
10191 * the stack must be reserved to hold the current active/continue
10192 * masks */
10193 if (reason == FC_PUSH_VPM) {
10194 elements += 2;
10195 }
10196 break;
10197
10198 case CAYMAN:
10199 /* r9xx: any stack operation on empty stack consumes 2 additional
10200 * elements */
10201 elements += 2;
10202
10203 /* fallthrough */
10204 /* FIXME: do the two elements added above cover the cases for the
10205 * r8xx+ below? */
10206
10207 case EVERGREEN:
10208 /* r8xx+: 2 extra elements are not always required, but one extra
10209 * element must be added for each of the following cases:
10210 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
10211 * stack usage.
10212 * (Currently we don't use ALU_ELSE_AFTER.)
10213 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
10214 * PUSH instruction executed.
10215 *
10216 	 * NOTE: it seems we also need to reserve an additional element in some
10217 * other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
10218 * then STACK_SIZE should be 2 instead of 1 */
10219 if (reason == FC_PUSH_VPM) {
10220 elements += 1;
10221 }
10222 break;
10223
10224 default:
10225 assert(0);
10226 break;
10227 }
10228
10229 /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
10230 * for all chips, so we use 4 in the final formula, not the real entry_size
10231 * for the chip */
10232 entry_size = 4;
10233
10234 entries = (elements + (entry_size - 1)) / entry_size;
10235
10236 if (entries > stack->max_entries)
10237 stack->max_entries = entries;
10238 }
10239
10240 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
10241 {
10242 switch(reason) {
10243 case FC_PUSH_VPM:
10244 --ctx->bc->stack.push;
10245 assert(ctx->bc->stack.push >= 0);
10246 break;
10247 case FC_PUSH_WQM:
10248 --ctx->bc->stack.push_wqm;
10249 assert(ctx->bc->stack.push_wqm >= 0);
10250 break;
10251 case FC_LOOP:
10252 --ctx->bc->stack.loop;
10253 assert(ctx->bc->stack.loop >= 0);
10254 break;
10255 default:
10256 assert(0);
10257 break;
10258 }
10259 }
10260
10261 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
10262 {
10263 switch (reason) {
10264 case FC_PUSH_VPM:
10265 ++ctx->bc->stack.push;
10266 break;
10267 case FC_PUSH_WQM:
10268 ++ctx->bc->stack.push_wqm;
10269 		++ctx->bc->stack.push_wqm;
		break;
10270 	case FC_LOOP:
10271 break;
10272 default:
10273 assert(0);
10274 }
10275
10276 callstack_update_max_depth(ctx, reason);
10277 }
10278
10279 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
10280 {
10281 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
10282
10283 sp->mid = realloc((void *)sp->mid,
10284 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
10285 sp->mid[sp->num_mid] = ctx->bc->cf_last;
10286 sp->num_mid++;
10287 }
10288
10289 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
10290 {
10291 assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
10292 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
10293 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
10294 ctx->bc->fc_sp++;
10295 }
10296
10297 static void fc_poplevel(struct r600_shader_ctx *ctx)
10298 {
10299 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
10300 free(sp->mid);
10301 sp->mid = NULL;
10302 sp->num_mid = 0;
10303 sp->start = NULL;
10304 sp->type = 0;
10305 ctx->bc->fc_sp--;
10306 }
10307
10308 #if 0
10309 static int emit_return(struct r600_shader_ctx *ctx)
10310 {
10311 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN);
10312 return 0;
10313 }
10314
10315 static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
10316 {
10317
10318 	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
10319 ctx->bc->cf_last->pop_count = pops;
10320 /* XXX work out offset */
10321 return 0;
10322 }
10323
10324 static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
10325 {
10326 return 0;
10327 }
10328
10329 static void emit_testflag(struct r600_shader_ctx *ctx)
10330 {
10331
10332 }
10333
10334 static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
10335 {
10336 emit_testflag(ctx);
10337 emit_jump_to_offset(ctx, 1, 4);
10338 emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
10339 pops(ctx, ifidx + 1);
10340 emit_return(ctx);
10341 }
10342
10343 static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
10344 {
10345 emit_testflag(ctx);
10346
10347 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10348 ctx->bc->cf_last->pop_count = 1;
10349
10350 fc_set_mid(ctx, fc_sp);
10351
10352 pops(ctx, 1);
10353 }
10354 #endif
10355
10356 static int emit_if(struct r600_shader_ctx *ctx, int opcode,
10357 struct r600_bytecode_alu_src *src)
10358 {
10359 int alu_type = CF_OP_ALU_PUSH_BEFORE;
10360
10361 /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
10362 * LOOP_STARTxxx for nested loops may put the branch stack into a state
10363 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
10364 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
10365 if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
10366 r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
10367 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
10368 alu_type = CF_OP_ALU;
10369 }
10370
10371 emit_logic_pred(ctx, opcode, alu_type, src);
10372
10373 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
10374
10375 fc_pushlevel(ctx, FC_IF);
10376
10377 callstack_push(ctx, FC_PUSH_VPM);
10378 return 0;
10379 }
10380
10381 static int tgsi_if(struct r600_shader_ctx *ctx)
10382 {
10383 struct r600_bytecode_alu_src alu_src;
10384 r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10385
10386 return emit_if(ctx, ALU_OP2_PRED_SETNE, &alu_src);
10387 }
10388
10389 static int tgsi_uif(struct r600_shader_ctx *ctx)
10390 {
10391 struct r600_bytecode_alu_src alu_src;
10392 r600_bytecode_src(&alu_src, &ctx->src[0], 0);
10393 return emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
10394 }
10395
10396 static int tgsi_else(struct r600_shader_ctx *ctx)
10397 {
10398 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
10399 ctx->bc->cf_last->pop_count = 1;
10400
10401 fc_set_mid(ctx, ctx->bc->fc_sp - 1);
10402 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
10403 return 0;
10404 }
10405
10406 static int tgsi_endif(struct r600_shader_ctx *ctx)
10407 {
10408 pops(ctx, 1);
10409 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
10410 R600_ERR("if/endif unbalanced in shader\n");
10411 return -1;
10412 }
10413
10414 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
10415 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
10416 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
10417 } else {
10418 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
10419 }
10420 fc_poplevel(ctx);
10421
10422 callstack_pop(ctx, FC_PUSH_VPM);
10423 return 0;
10424 }
10425
10426 static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
10427 {
10428 /* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
10429 * limited to 4096 iterations, like the other LOOP_* instructions. */
10430 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
10431
10432 fc_pushlevel(ctx, FC_LOOP);
10433
10434 /* check stack depth */
10435 callstack_push(ctx, FC_LOOP);
10436 return 0;
10437 }
10438
10439 static int tgsi_endloop(struct r600_shader_ctx *ctx)
10440 {
10441 int i;
10442
10443 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
10444
10445 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
10446 R600_ERR("loop/endloop in shader code are not paired.\n");
10447 return -EINVAL;
10448 }
10449
10450 /* fixup loop pointers - from r600isa
10451 LOOP END points to CF after LOOP START,
10452 LOOP START point to CF after LOOP END
10453 BRK/CONT point to LOOP END CF
10454 */
10455 ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;
10456
10457 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
10458
10459 for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
10460 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
10461 }
10462 /* XXX add LOOPRET support */
10463 fc_poplevel(ctx);
10464 callstack_pop(ctx, FC_LOOP);
10465 return 0;
10466 }
10467
10468 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
10469 {
10470 unsigned int fscp;
10471
10472 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
10473 {
10474 if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
10475 break;
10476 }
10477
10478 if (fscp == 0) {
10479 R600_ERR("Break not inside loop/endloop pair\n");
10480 return -EINVAL;
10481 }
10482
10483 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10484
10485 fc_set_mid(ctx, fscp - 1);
10486
10487 return 0;
10488 }
10489
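/* EMIT/CUT: ring writes for the stream are flushed before the CF instruction,
 * whose count field carries the stream index; EMIT_VERTEX then advances the
 * per-stream ring offset */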
10490 static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
10491 {
10492 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10493 int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
10494 int r;
10495
10496 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
10497 emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
10498
10499 r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
10500 if (!r) {
10501 ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
10502 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
10503 return emit_inc_ring_offset(ctx, stream, TRUE);
10504 }
10505 return r;
10506 }
10507
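/* UMAD: dst = src0 * src1 + src2; there is no integer multiply-add, so lower
 * to MULLO_UINT into a temp followed by ADD_INT */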
10508 static int tgsi_umad(struct r600_shader_ctx *ctx)
10509 {
10510 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10511 struct r600_bytecode_alu alu;
10512 int i, j, r;
10513 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10514
10515 /* src0 * src1 */
10516 for (i = 0; i < lasti + 1; i++) {
10517 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10518 continue;
10519
10520 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10521
10522 alu.dst.chan = i;
10523 alu.dst.sel = ctx->temp_reg;
10524 alu.dst.write = 1;
10525
10526 alu.op = ALU_OP2_MULLO_UINT;
10527 for (j = 0; j < 2; j++) {
10528 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
10529 }
10530
10531 alu.last = 1;
10532 r = emit_mul_int_op(ctx->bc, &alu);
10533 if (r)
10534 return r;
10535 }
10536
10537
10538 for (i = 0; i < lasti + 1; i++) {
10539 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10540 continue;
10541
10542 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10543 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10544
10545 alu.op = ALU_OP2_ADD_INT;
10546
10547 alu.src[0].sel = ctx->temp_reg;
10548 alu.src[0].chan = i;
10549
10550 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
10551 if (i == lasti) {
10552 alu.last = 1;
10553 }
10554 r = r600_bytecode_add_alu(ctx->bc, &alu);
10555 if (r)
10556 return r;
10557 }
10558 return 0;
10559 }
10560
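/* PK2H: convert src.x/src.y to half floats and pack them into one dword as
 * (hi << 16) | lo, computed as temp.y * 0x10000 + temp.x with MULADD_UINT24 */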
10561 static int tgsi_pk2h(struct r600_shader_ctx *ctx)
10562 {
10563 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10564 struct r600_bytecode_alu alu;
10565 int r, i;
10566 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10567
10568 /* temp.xy = f32_to_f16(src) */
10569 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10570 alu.op = ALU_OP1_FLT32_TO_FLT16;
10571 alu.dst.chan = 0;
10572 alu.dst.sel = ctx->temp_reg;
10573 alu.dst.write = 1;
10574 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10575 r = r600_bytecode_add_alu(ctx->bc, &alu);
10576 if (r)
10577 return r;
10578 alu.dst.chan = 1;
10579 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
10580 alu.last = 1;
10581 r = r600_bytecode_add_alu(ctx->bc, &alu);
10582 if (r)
10583 return r;
10584
10585 /* dst.x = temp.y * 0x10000 + temp.x */
10586 for (i = 0; i < lasti + 1; i++) {
10587 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10588 continue;
10589
10590 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10591 alu.op = ALU_OP3_MULADD_UINT24;
10592 alu.is_op3 = 1;
10593 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10594 alu.last = i == lasti;
10595 alu.src[0].sel = ctx->temp_reg;
10596 alu.src[0].chan = 1;
10597 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10598 alu.src[1].value = 0x10000;
10599 alu.src[2].sel = ctx->temp_reg;
10600 alu.src[2].chan = 0;
10601 r = r600_bytecode_add_alu(ctx->bc, &alu);
10602 if (r)
10603 return r;
10604 }
10605
10606 return 0;
10607 }
10608
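/* UP2H: unpack two half floats from one dword; bits 15:0 feed dst.x/z and
 * bits 31:16 feed dst.y/w, each widened with FLT16_TO_FLT32 */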
10609 static int tgsi_up2h(struct r600_shader_ctx *ctx)
10610 {
10611 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10612 struct r600_bytecode_alu alu;
10613 int r, i;
10614 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10615
10616 /* temp.x = src.x */
10617 /* note: no need to mask out the high bits */
10618 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10619 alu.op = ALU_OP1_MOV;
10620 alu.dst.chan = 0;
10621 alu.dst.sel = ctx->temp_reg;
10622 alu.dst.write = 1;
10623 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10624 r = r600_bytecode_add_alu(ctx->bc, &alu);
10625 if (r)
10626 return r;
10627
10628 /* temp.y = src.x >> 16 */
10629 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10630 alu.op = ALU_OP2_LSHR_INT;
10631 alu.dst.chan = 1;
10632 alu.dst.sel = ctx->temp_reg;
10633 alu.dst.write = 1;
10634 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10635 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10636 alu.src[1].value = 16;
10637 alu.last = 1;
10638 r = r600_bytecode_add_alu(ctx->bc, &alu);
10639 if (r)
10640 return r;
10641
10642 /* dst.wz = dst.xy = f16_to_f32(temp.xy) */
10643 for (i = 0; i < lasti + 1; i++) {
10644 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
10645 continue;
10646 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10647 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10648 alu.op = ALU_OP1_FLT16_TO_FLT32;
10649 alu.src[0].sel = ctx->temp_reg;
10650 alu.src[0].chan = i % 2;
10651 alu.last = i == lasti;
10652 r = r600_bytecode_add_alu(ctx->bc, &alu);
10653 if (r)
10654 return r;
10655 }
10656
10657 return 0;
10658 }
10659
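/* BFE with a fixup for field widths >= 32, where the hardware result does not
 * match TGSI: SETGE_INT/CNDE_INT select the unmodified source instead, going
 * through a spare temp when the destination aliases src0 or src2 */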
10660 static int tgsi_bfe(struct r600_shader_ctx *ctx)
10661 {
10662 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10663 struct r600_bytecode_alu alu;
10664 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10665 int r, i;
10666 int dst = -1;
10667
10668 if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
10669 inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
10670 (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
10671 inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
10672 dst = r600_get_temp(ctx);
10673
10674 r = tgsi_op3_dst(ctx, dst);
10675 if (r)
10676 return r;
10677
10678 for (i = 0; i < lasti + 1; i++) {
10679 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10680 alu.op = ALU_OP2_SETGE_INT;
10681 r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
10682 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10683 alu.src[1].value = 32;
10684 alu.dst.sel = ctx->temp_reg;
10685 alu.dst.chan = i;
10686 alu.dst.write = 1;
10687 if (i == lasti)
10688 alu.last = 1;
10689 r = r600_bytecode_add_alu(ctx->bc, &alu);
10690 if (r)
10691 return r;
10692 }
10693
10694 for (i = 0; i < lasti + 1; i++) {
10695 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10696 alu.op = ALU_OP3_CNDE_INT;
10697 alu.is_op3 = 1;
10698 alu.src[0].sel = ctx->temp_reg;
10699 alu.src[0].chan = i;
10700
10701 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10702 if (dst != -1)
10703 alu.src[1].sel = dst;
10704 else
10705 alu.src[1].sel = alu.dst.sel;
10706 alu.src[1].chan = i;
10707 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
10708 alu.dst.write = 1;
10709 if (i == lasti)
10710 alu.last = 1;
10711 r = r600_bytecode_add_alu(ctx->bc, &alu);
10712 if (r)
10713 return r;
10714 }
10715
10716 return 0;
10717 }
10718
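/* CLOCK: read the free-running 64-bit counter as two MOVs from the TIME_LO
 * and TIME_HI special registers */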
10719 static int tgsi_clock(struct r600_shader_ctx *ctx)
10720 {
10721 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10722 struct r600_bytecode_alu alu;
10723 int r;
10724
10725 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10726 alu.op = ALU_OP1_MOV;
10727 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
10728 alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
10729 r = r600_bytecode_add_alu(ctx->bc, &alu);
10730 if (r)
10731 return r;
10732 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10733 alu.op = ALU_OP1_MOV;
10734 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
10735 	alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
	alu.last = 1;
10736 r = r600_bytecode_add_alu(ctx->bc, &alu);
10737 if (r)
10738 return r;
10739 return 0;
10740 }
10741
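/* 64-bit add/sub on the 32-bit ALU: add (or subtract) both halves, recompute
 * the low-half carry/borrow with ADDC_UINT/SUBB_UINT in a third channel, then
 * fold it into the high half */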
10742 static int emit_u64add(struct r600_shader_ctx *ctx, int op,
10743 int treg,
10744 int src0_sel, int src0_chan,
10745 int src1_sel, int src1_chan)
10746 {
10747 struct r600_bytecode_alu alu;
10748 int r;
10749 int opc;
10750
10751 if (op == ALU_OP2_ADD_INT)
10752 opc = ALU_OP2_ADDC_UINT;
10753 else
10754 opc = ALU_OP2_SUBB_UINT;
10755
10756 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10757 	alu.op = op;
10758 alu.dst.sel = treg;
10759 alu.dst.chan = 0;
10760 alu.dst.write = 1;
10761 alu.src[0].sel = src0_sel;
10762 alu.src[0].chan = src0_chan + 0;
10763 alu.src[1].sel = src1_sel;
10764 alu.src[1].chan = src1_chan + 0;
10765 alu.src[1].neg = 0;
10766 r = r600_bytecode_add_alu(ctx->bc, &alu);
10767 if (r)
10768 return r;
10769
10770 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10771 alu.op = op;
10772 alu.dst.sel = treg;
10773 alu.dst.chan = 1;
10774 alu.dst.write = 1;
10775 alu.src[0].sel = src0_sel;
10776 alu.src[0].chan = src0_chan + 1;
10777 alu.src[1].sel = src1_sel;
10778 alu.src[1].chan = src1_chan + 1;
10779 alu.src[1].neg = 0;
10780 r = r600_bytecode_add_alu(ctx->bc, &alu);
10781 if (r)
10782 return r;
10783
10784 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10785 alu.op = opc;
10786 alu.dst.sel = treg;
10787 alu.dst.chan = 2;
10788 alu.dst.write = 1;
10789 alu.last = 1;
10790 alu.src[0].sel = src0_sel;
10791 alu.src[0].chan = src0_chan + 0;
10792 alu.src[1].sel = src1_sel;
10793 alu.src[1].chan = src1_chan + 0;
10794 alu.src[1].neg = 0;
10795 r = r600_bytecode_add_alu(ctx->bc, &alu);
10796 if (r)
10797 return r;
10798
10799 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10800 alu.op = op;
10801 alu.dst.sel = treg;
10802 alu.dst.chan = 1;
10803 alu.dst.write = 1;
10804 alu.src[0].sel = treg;
10805 alu.src[0].chan = 1;
10806 alu.src[1].sel = treg;
10807 alu.src[1].chan = 2;
10808 alu.last = 1;
10809 r = r600_bytecode_add_alu(ctx->bc, &alu);
10810 if (r)
10811 return r;
10812 return 0;
10813 }
10814
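/* U64ADD: same carry scheme as emit_u64add but on TGSI operands; a neg
 * modifier on the second source is folded into SUB_INT/SUBB_UINT and the
 * modifier itself cleared */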
10815 static int egcm_u64add(struct r600_shader_ctx *ctx)
10816 {
10817 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10818 struct r600_bytecode_alu alu;
10819 int r;
10820 int treg = ctx->temp_reg;
10821 int op = ALU_OP2_ADD_INT, opc = ALU_OP2_ADDC_UINT;
10822
10823 if (ctx->src[1].neg) {
10824 op = ALU_OP2_SUB_INT;
10825 opc = ALU_OP2_SUBB_UINT;
10826 }
10827 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10828 	alu.op = op;
10829 alu.dst.sel = treg;
10830 alu.dst.chan = 0;
10831 alu.dst.write = 1;
10832 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10833 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
10834 alu.src[1].neg = 0;
10835 r = r600_bytecode_add_alu(ctx->bc, &alu);
10836 if (r)
10837 return r;
10838
10839 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10840 alu.op = op;
10841 alu.dst.sel = treg;
10842 alu.dst.chan = 1;
10843 alu.dst.write = 1;
10844 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
10845 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
10846 alu.src[1].neg = 0;
10847 r = r600_bytecode_add_alu(ctx->bc, &alu);
10848 if (r)
10849 return r;
10850
10851 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10852 	alu.op = opc;
10853 alu.dst.sel = treg;
10854 alu.dst.chan = 2;
10855 alu.dst.write = 1;
10856 alu.last = 1;
10857 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10858 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
10859 alu.src[1].neg = 0;
10860 r = r600_bytecode_add_alu(ctx->bc, &alu);
10861 if (r)
10862 return r;
10863
10864 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10865 alu.op = op;
10866 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
10867 alu.src[0].sel = treg;
10868 alu.src[0].chan = 1;
10869 alu.src[1].sel = treg;
10870 alu.src[1].chan = 2;
10871 alu.last = 1;
10872 r = r600_bytecode_add_alu(ctx->bc, &alu);
10873 if (r)
10874 return r;
10875 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10876 alu.op = ALU_OP1_MOV;
10877 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
10878 alu.src[0].sel = treg;
10879 alu.src[0].chan = 0;
10880 alu.last = 1;
10881 r = r600_bytecode_add_alu(ctx->bc, &alu);
10882 if (r)
10883 return r;
10884 return 0;
10885 }
10886
10887 /* 64-bit multiply, keeping the low 64 bits of the product:
10888    result.x = mul_lo(a.x, b.x)
10889    result.y = mul_hi(a.x, b.x) + a.x * b.y + a.y * b.x
10890 */
10891 static int egcm_u64mul(struct r600_shader_ctx *ctx)
10892 {
10893 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10894 struct r600_bytecode_alu alu;
10895 int r;
10896 int treg = ctx->temp_reg;
10897
10898 /* temp.x = mul_lo a.x, b.x */
10899 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10900 alu.op = ALU_OP2_MULLO_UINT;
10901 alu.dst.sel = treg;
10902 alu.dst.chan = 0;
10903 alu.dst.write = 1;
10904 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10905 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
10906 r = emit_mul_int_op(ctx->bc, &alu);
10907 if (r)
10908 return r;
10909
10910 /* temp.y = mul_hi a.x, b.x */
10911 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10912 alu.op = ALU_OP2_MULHI_UINT;
10913 alu.dst.sel = treg;
10914 alu.dst.chan = 1;
10915 alu.dst.write = 1;
10916 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10917 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
10918 r = emit_mul_int_op(ctx->bc, &alu);
10919 if (r)
10920 return r;
10921
10922 /* temp.z = mul a.x, b.y */
10923 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10924 alu.op = ALU_OP2_MULLO_UINT;
10925 alu.dst.sel = treg;
10926 alu.dst.chan = 2;
10927 alu.dst.write = 1;
10928 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
10929 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
10930 r = emit_mul_int_op(ctx->bc, &alu);
10931 if (r)
10932 return r;
10933
10934 /* temp.w = mul a.y, b.x */
10935 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10936 alu.op = ALU_OP2_MULLO_UINT;
10937 alu.dst.sel = treg;
10938 alu.dst.chan = 3;
10939 alu.dst.write = 1;
10940 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
10941 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
10942 r = emit_mul_int_op(ctx->bc, &alu);
10943 if (r)
10944 return r;
10945
10946 /* temp.z = temp.z + temp.w */
10947 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10948 alu.op = ALU_OP2_ADD_INT;
10949 alu.dst.sel = treg;
10950 alu.dst.chan = 2;
10951 alu.dst.write = 1;
10952 alu.src[0].sel = treg;
10953 alu.src[0].chan = 2;
10954 alu.src[1].sel = treg;
10955 alu.src[1].chan = 3;
10956 alu.last = 1;
10957 r = r600_bytecode_add_alu(ctx->bc, &alu);
10958 if (r)
10959 return r;
10960
10961 /* temp.y = temp.y + temp.z */
10962 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10963 alu.op = ALU_OP2_ADD_INT;
10964 alu.dst.sel = treg;
10965 alu.dst.chan = 1;
10966 alu.dst.write = 1;
10967 alu.src[0].sel = treg;
10968 alu.src[0].chan = 1;
10969 alu.src[1].sel = treg;
10970 alu.src[1].chan = 2;
10971 alu.last = 1;
10972 r = r600_bytecode_add_alu(ctx->bc, &alu);
10973 if (r)
10974 return r;
10975
10976 /* dst.x = temp.x */
10977 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10978 alu.op = ALU_OP1_MOV;
10979 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
10980 alu.src[0].sel = treg;
10981 alu.src[0].chan = 0;
10982 r = r600_bytecode_add_alu(ctx->bc, &alu);
10983 if (r)
10984 return r;
10985
10986 /* dst.y = temp.y */
10987 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10988 alu.op = ALU_OP1_MOV;
10989 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
10990 alu.src[0].sel = treg;
10991 alu.src[0].chan = 1;
10992 alu.last = 1;
10993 r = r600_bytecode_add_alu(ctx->bc, &alu);
10994 if (r)
10995 return r;
10996
10997 return 0;
10998 }
10999
11000 static int emit_u64sge(struct r600_shader_ctx *ctx,
11001 int treg,
11002 int src0_sel, int src0_base_chan,
11003 int src1_sel, int src1_base_chan)
11004 {
11005 int r;
11006 /* for 64-bit sge */
11007 	/* result = (src0.y > src1.y) || ((src0.y == src1.y) && (src0.x >= src1.x)) */
11008 r = single_alu_op2(ctx, ALU_OP2_SETGT_UINT,
11009 treg, 1,
11010 src0_sel, src0_base_chan + 1,
11011 src1_sel, src1_base_chan + 1);
11012 if (r)
11013 return r;
11014
11015 r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11016 treg, 0,
11017 src0_sel, src0_base_chan,
11018 src1_sel, src1_base_chan);
11019 if (r)
11020 return r;
11021
11022 r = single_alu_op2(ctx, ALU_OP2_SETE_INT,
11023 treg, 2,
11024 src0_sel, src0_base_chan + 1,
11025 src1_sel, src1_base_chan + 1);
11026 if (r)
11027 return r;
11028
11029 r = single_alu_op2(ctx, ALU_OP2_AND_INT,
11030 treg, 0,
11031 treg, 0,
11032 treg, 2);
11033 if (r)
11034 return r;
11035
11036 r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11037 treg, 0,
11038 treg, 0,
11039 treg, 1);
11040 if (r)
11041 return r;
11042 return 0;
11043 }
11044
11045 /* this isn't a complete div, just enough for the qbo shader to work */
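/* restoring long division with both per-bit loops fully unrolled and each
 * trial subtraction predicated via emit_if: the first loop builds the high
 * quotient word (tmp_num.w), the second the low word (tmp_num.z) */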
11046 static int egcm_u64div(struct r600_shader_ctx *ctx)
11047 {
11048 struct r600_bytecode_alu alu;
11049 struct r600_bytecode_alu_src alu_num_hi, alu_num_lo, alu_denom_hi, alu_denom_lo, alu_src;
11050 int r, i;
11051 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11052
11053 	/* make sure we are dividing by a constant with 0 in the high bits */
11054 if (ctx->src[1].sel != V_SQ_ALU_SRC_LITERAL)
11055 return -1;
11056 if (ctx->src[1].value[ctx->src[1].swizzle[1]] != 0)
11057 return -1;
11058 /* make sure we are doing one division */
11059 if (inst->Dst[0].Register.WriteMask != 0x3)
11060 return -1;
11061
11062 	/* emit_if uses ctx->temp_reg, so we can't use it here */
11063 int treg = r600_get_temp(ctx);
11064 int tmp_num = r600_get_temp(ctx);
11065 int sub_tmp = r600_get_temp(ctx);
11066
11067 /* tmp quot are tmp_num.zw */
11068 r600_bytecode_src(&alu_num_lo, &ctx->src[0], 0);
11069 r600_bytecode_src(&alu_num_hi, &ctx->src[0], 1);
11070 r600_bytecode_src(&alu_denom_lo, &ctx->src[1], 0);
11071 r600_bytecode_src(&alu_denom_hi, &ctx->src[1], 1);
11072
11073 /* MOV tmp_num.xy, numerator */
11074 r = single_alu_op2(ctx, ALU_OP1_MOV,
11075 tmp_num, 0,
11076 alu_num_lo.sel, alu_num_lo.chan,
11077 0, 0);
11078 if (r)
11079 return r;
11080 r = single_alu_op2(ctx, ALU_OP1_MOV,
11081 tmp_num, 1,
11082 alu_num_hi.sel, alu_num_hi.chan,
11083 0, 0);
11084 if (r)
11085 return r;
11086
11087 r = single_alu_op2(ctx, ALU_OP1_MOV,
11088 tmp_num, 2,
11089 V_SQ_ALU_SRC_LITERAL, 0,
11090 0, 0);
11091 if (r)
11092 return r;
11093
11094 r = single_alu_op2(ctx, ALU_OP1_MOV,
11095 tmp_num, 3,
11096 V_SQ_ALU_SRC_LITERAL, 0,
11097 0, 0);
11098 if (r)
11099 return r;
11100
11101 /* treg 0 is log2_denom */
11102 /* normally this gets the MSB for the denom high value
11103 - however we know this will always be 0 here. */
11104 r = single_alu_op2(ctx,
11105 ALU_OP1_MOV,
11106 treg, 0,
11107 V_SQ_ALU_SRC_LITERAL, 32,
11108 0, 0);
11109 if (r)
11110 return r;
11111
11112 	/* normally check denom hi for 0, but we know it is already */
11113 	/* t0.y = num_hi >= denom_lo */
11114 r = single_alu_op2(ctx,
11115 ALU_OP2_SETGE_UINT,
11116 treg, 1,
11117 alu_num_hi.sel, alu_num_hi.chan,
11118 V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
11119 if (r)
11120 return r;
11121
11122 memset(&alu_src, 0, sizeof(alu_src));
11123 alu_src.sel = treg;
11124 alu_src.chan = 1;
11125 r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11126 if (r)
11127 return r;
11128
11129 	/* the division loops are fully unrolled from here on */
11130 /* get msb t0.x = msb(src[1].x) first */
11131 int msb_lo = util_last_bit(alu_denom_lo.value);
11132 r = single_alu_op2(ctx, ALU_OP1_MOV,
11133 treg, 0,
11134 V_SQ_ALU_SRC_LITERAL, msb_lo,
11135 0, 0);
11136 if (r)
11137 return r;
11138
11139 	/* first unrolled loop: computes the high quotient word */
11140 for (i = 0; i < 31; i++) {
11141 r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11142 treg, 2,
11143 V_SQ_ALU_SRC_LITERAL, i,
11144 treg, 0);
11145 if (r)
11146 return r;
11147
11148 		/* the shifted denominator is a constant, so compute it on the CPU */
11149 		uint32_t denom_lo_shl = alu_denom_lo.value << (31 - i);
11150 		/* t0.y = tmp_num.y >= denom_lo_shl */
11151 r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11152 treg, 1,
11153 tmp_num, 1,
11154 V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
11155 if (r)
11156 return r;
11157
11158 r = single_alu_op2(ctx, ALU_OP2_AND_INT,
11159 treg, 1,
11160 treg, 1,
11161 treg, 2);
11162 if (r)
11163 return r;
11164
11165 memset(&alu_src, 0, sizeof(alu_src));
11166 alu_src.sel = treg;
11167 alu_src.chan = 1;
11168 r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11169 if (r)
11170 return r;
11171
11172 r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
11173 tmp_num, 1,
11174 tmp_num, 1,
11175 V_SQ_ALU_SRC_LITERAL, denom_lo_shl);
11176 if (r)
11177 return r;
11178
11179 r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11180 tmp_num, 3,
11181 tmp_num, 3,
11182 V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
11183 if (r)
11184 return r;
11185
11186 r = tgsi_endif(ctx);
11187 if (r)
11188 return r;
11189 }
11190
11191 /* log2_denom is always <= 31, so manually peel the last loop
11192 * iteration.
11193 */
11194 r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11195 treg, 1,
11196 tmp_num, 1,
11197 V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
11198 if (r)
11199 return r;
11200
11201 memset(&alu_src, 0, sizeof(alu_src));
11202 alu_src.sel = treg;
11203 alu_src.chan = 1;
11204 r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11205 if (r)
11206 return r;
11207
11208 r = single_alu_op2(ctx, ALU_OP2_SUB_INT,
11209 tmp_num, 1,
11210 tmp_num, 1,
11211 V_SQ_ALU_SRC_LITERAL, alu_denom_lo.value);
11212 if (r)
11213 return r;
11214
11215 r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11216 tmp_num, 3,
11217 tmp_num, 3,
11218 V_SQ_ALU_SRC_LITERAL, 1U);
11219 if (r)
11220 return r;
11221 r = tgsi_endif(ctx);
11222 if (r)
11223 return r;
11224
11225 r = tgsi_endif(ctx);
11226 if (r)
11227 return r;
11228
11229 	/* on to the second unrolled loop, for the low quotient word */
11230 for (i = 0; i < 31; i++) {
11231 r = single_alu_op2(ctx, ALU_OP2_SETGE_UINT,
11232 treg, 1,
11233 V_SQ_ALU_SRC_LITERAL, (63 - (31 - i)),
11234 treg, 0);
11235 if (r)
11236 return r;
11237
11238 uint64_t denom_shl = (uint64_t)alu_denom_lo.value << (31 - i);
11239 r = single_alu_op2(ctx, ALU_OP1_MOV,
11240 treg, 2,
11241 V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
11242 0, 0);
11243 if (r)
11244 return r;
11245
11246 r = single_alu_op2(ctx, ALU_OP1_MOV,
11247 treg, 3,
11248 V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
11249 0, 0);
11250 if (r)
11251 return r;
11252
11253 r = emit_u64sge(ctx, sub_tmp,
11254 tmp_num, 0,
11255 treg, 2);
11256 if (r)
11257 return r;
11258
11259 r = single_alu_op2(ctx, ALU_OP2_AND_INT,
11260 treg, 1,
11261 treg, 1,
11262 sub_tmp, 0);
11263 if (r)
11264 return r;
11265
11266 memset(&alu_src, 0, sizeof(alu_src));
11267 alu_src.sel = treg;
11268 alu_src.chan = 1;
11269 r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11270 if (r)
11271 return r;
11272
11273
11274 r = emit_u64add(ctx, ALU_OP2_SUB_INT,
11275 sub_tmp,
11276 tmp_num, 0,
11277 treg, 2);
11278 if (r)
11279 return r;
11280
11281 r = single_alu_op2(ctx, ALU_OP1_MOV,
11282 tmp_num, 0,
11283 sub_tmp, 0,
11284 0, 0);
11285 if (r)
11286 return r;
11287
11288 r = single_alu_op2(ctx, ALU_OP1_MOV,
11289 tmp_num, 1,
11290 sub_tmp, 1,
11291 0, 0);
11292 if (r)
11293 return r;
11294
11295 r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11296 tmp_num, 2,
11297 tmp_num, 2,
11298 V_SQ_ALU_SRC_LITERAL, 1U << (31 - i));
11299 if (r)
11300 return r;
11301
11302 r = tgsi_endif(ctx);
11303 if (r)
11304 return r;
11305 }
11306
11307 /* log2_denom is always <= 63, so manually peel the last loop
11308 * iteration.
11309 */
11310 uint64_t denom_shl = (uint64_t)alu_denom_lo.value;
11311 r = single_alu_op2(ctx, ALU_OP1_MOV,
11312 treg, 2,
11313 V_SQ_ALU_SRC_LITERAL, (denom_shl & 0xffffffff),
11314 0, 0);
11315 if (r)
11316 return r;
11317
11318 r = single_alu_op2(ctx, ALU_OP1_MOV,
11319 treg, 3,
11320 V_SQ_ALU_SRC_LITERAL, (denom_shl >> 32),
11321 0, 0);
11322 if (r)
11323 return r;
11324
11325 r = emit_u64sge(ctx, sub_tmp,
11326 tmp_num, 0,
11327 treg, 2);
11328 if (r)
11329 return r;
11330
11331 memset(&alu_src, 0, sizeof(alu_src));
11332 alu_src.sel = sub_tmp;
11333 alu_src.chan = 0;
11334 r = emit_if(ctx, ALU_OP2_PRED_SETNE_INT, &alu_src);
11335 if (r)
11336 return r;
11337
11338 r = emit_u64add(ctx, ALU_OP2_SUB_INT,
11339 sub_tmp,
11340 tmp_num, 0,
11341 treg, 2);
11342 if (r)
11343 return r;
11344
11345 r = single_alu_op2(ctx, ALU_OP2_OR_INT,
11346 tmp_num, 2,
11347 tmp_num, 2,
11348 V_SQ_ALU_SRC_LITERAL, 1U);
11349 if (r)
11350 return r;
11351 r = tgsi_endif(ctx);
11352 if (r)
11353 return r;
11354
11355 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11356 alu.op = ALU_OP1_MOV;
11357 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11358 alu.src[0].sel = tmp_num;
11359 alu.src[0].chan = 2;
11360 r = r600_bytecode_add_alu(ctx->bc, &alu);
11361 if (r)
11362 return r;
11363
11364 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11365 alu.op = ALU_OP1_MOV;
11366 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
11367 alu.src[0].sel = tmp_num;
11368 alu.src[0].chan = 3;
11369 alu.last = 1;
11370 r = r600_bytecode_add_alu(ctx->bc, &alu);
11371 if (r)
11372 return r;
11373 return 0;
11374 }
11375
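/* U64SNE: 64-bit not-equal; SETNE_INT on each half, then OR the two results */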
11376 static int egcm_u64sne(struct r600_shader_ctx *ctx)
11377 {
11378 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
11379 struct r600_bytecode_alu alu;
11380 int r;
11381 int treg = ctx->temp_reg;
11382
11383 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11384 alu.op = ALU_OP2_SETNE_INT;
11385 alu.dst.sel = treg;
11386 alu.dst.chan = 0;
11387 alu.dst.write = 1;
11388 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
11389 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
11390 r = r600_bytecode_add_alu(ctx->bc, &alu);
11391 if (r)
11392 return r;
11393
11394 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11395 alu.op = ALU_OP2_SETNE_INT;
11396 alu.dst.sel = treg;
11397 alu.dst.chan = 1;
11398 alu.dst.write = 1;
11399 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
11400 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
11401 alu.last = 1;
11402 r = r600_bytecode_add_alu(ctx->bc, &alu);
11403 if (r)
11404 return r;
11405
11406 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
11407 alu.op = ALU_OP2_OR_INT;
11408 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
11409 alu.src[0].sel = treg;
11410 alu.src[0].chan = 0;
11411 alu.src[1].sel = treg;
11412 alu.src[1].chan = 1;
11413 alu.last = 1;
11414 r = r600_bytecode_add_alu(ctx->bc, &alu);
11415 if (r)
11416 return r;
11417 return 0;
11418 }
11419
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},

	[TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
	/* MIN_DX10 returns the non-NaN source if one src is NaN; MIN returns NaN. */
	[TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21] = { ALU_OP0_NOP, tgsi_unsupported},
	[22] = { ALU_OP0_NOP, tgsi_unsupported},
	[23] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
	[25] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
	[31] = { ALU_OP0_NOP, tgsi_unsupported},
	[32] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_unsupported},
	[34] = { ALU_OP0_NOP, tgsi_unsupported},
	[35] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
	[TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[44] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
	[46] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
	[51] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[59] = { ALU_OP0_NOP, tgsi_unsupported},
	[60] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_r600_arl},
	[62] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
	[67] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
	[76] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
	[81] = { ALU_OP0_NOP, tgsi_unsupported},
	[82] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
	[93] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ] = { ALU_OP0_NOP, tgsi_unsupported},
	[106] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_unsupported},
	[113] = { ALU_OP0_NOP, tgsi_unsupported},
	[114] = { ALU_OP0_NOP, tgsi_unsupported},
	[115] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
	[TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
	[TGSI_OPCODE_DFMA] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
	[163] = { ALU_OP0_NOP, tgsi_unsupported},
	[164] = { ALU_OP0_NOP, tgsi_unsupported},
	[165] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
};

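/* The Evergreen table differs from R600 mainly in what is no longer
 * tgsi_unsupported: FMA, fine derivatives, half-float packing
 * (PK2H/UP2H), CLOCK, memory load/store, barriers, RAT atomics, the
 * bitfield ops and the 64-bit float/int opcodes all have real
 * implementations here.
 */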
static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
	[TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21] = { ALU_OP0_NOP, tgsi_unsupported},
	[22] = { ALU_OP0_NOP, tgsi_unsupported},
	[23] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
	[25] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
	[31] = { ALU_OP0_NOP, tgsi_unsupported},
	[32] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
	[34] = { ALU_OP0_NOP, tgsi_unsupported},
	[35] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
	[TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
	[TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[44] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
	[46] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
	[51] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
	[TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[59] = { ALU_OP0_NOP, tgsi_unsupported},
	[60] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
	[62] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
	[67] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
	[76] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[82] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
	[88] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
	[93] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
	[106] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[113] = { ALU_OP0_NOP, tgsi_unsupported},
	[114] = { ALU_OP0_NOP, tgsi_unsupported},
	[115] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
	[TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
	/* Refer below for TGSI_OPCODE_DFMA */
	[TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_f2i},
	[TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
	[TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
	[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
	[163] = { ALU_OP0_NOP, tgsi_unsupported},
	[164] = { ALU_OP0_NOP, tgsi_unsupported},
	[165] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
	[TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
	[TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
	[TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },
	[TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },
	[TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },
	[TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },
	[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
};

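/* The Cayman table mostly mirrors Evergreen, but the operations that
 * used to be t-slot only (RCP, RSQ, SQRT, EX2, LG2, SIN/COS, POW and
 * the integer multiplies) go through the cayman_* helpers, which
 * replicate the instruction across the vector slots instead.
 */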
static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
	[TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
	[21] = { ALU_OP0_NOP, tgsi_unsupported},
	[22] = { ALU_OP0_NOP, tgsi_unsupported},
	[23] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
	[25] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
	[TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow},
	[31] = { ALU_OP0_NOP, tgsi_unsupported},
	[32] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
	[34] = { ALU_OP0_NOP, tgsi_unsupported},
	[35] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},
	[TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
	[TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
	[TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[44] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
	[46] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig},
	[TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
	[51] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
	[TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
	[59] = { ALU_OP0_NOP, tgsi_unsupported},
	[60] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
	[62] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
	[67] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
	[76] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[82] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
	[88] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
	[93] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
	[103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
	[106] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[113] = { ALU_OP0_NOP, tgsi_unsupported},
	[114] = { ALU_OP0_NOP, tgsi_unsupported},
	[115] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
	[TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
	/* Refer below for TGSI_OPCODE_DFMA */
	[TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2},
	[TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2},
	[TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2},
	[TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
	[TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
	[163] = { ALU_OP0_NOP, tgsi_unsupported},
	[164] = { ALU_OP0_NOP, tgsi_unsupported},
	[165] = { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
	[TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
	[TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
	[TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
	[TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
	[TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_U64SNE] = { ALU_OP0_NOP, egcm_u64sne },
	[TGSI_OPCODE_U64ADD] = { ALU_OP0_NOP, egcm_u64add },
	[TGSI_OPCODE_U64MUL] = { ALU_OP0_NOP, egcm_u64mul },
	[TGSI_OPCODE_U64DIV] = { ALU_OP0_NOP, egcm_u64div },
	[TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
};