r600_shader: only load from LDS what is really used
[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include "r600_sq.h"
24 #include "r600_formats.h"
25 #include "r600_opcodes.h"
26 #include "r600_shader.h"
27 #include "r600d.h"
28
29 #include "sb/sb_public.h"
30
31 #include "pipe/p_shader_tokens.h"
32 #include "tgsi/tgsi_info.h"
33 #include "tgsi/tgsi_parse.h"
34 #include "tgsi/tgsi_scan.h"
35 #include "tgsi/tgsi_dump.h"
36 #include "util/u_bitcast.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include <stdio.h>
40 #include <errno.h>
41
42 /* CAYMAN notes
43 Why CAYMAN got loops for lots of instructions is explained here.
44
45 -These 8xx t-slot only ops are implemented in all vector slots.
46 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47 These 8xx t-slot only opcodes become vector ops, with all four
48 slots expecting the arguments on sources a and b. Result is
49 broadcast to all channels.
50 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51 These 8xx t-slot only opcodes become vector ops in the z, y, and
52 x slots.
53 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55 SQRT_IEEE/_64
56 SIN/COS
57 The w slot may have an independent co-issued operation, or if the
58 result is required to be in the w slot, the opcode above may be
59 issued in the w slot as well.
60 The compiler must issue the source argument to slots z, y, and x
61 */
62
63 /* Contents of r0 on entry to various shaders
64
65 VS - .x = VertexID
66 .y = RelVertexID (??)
67 .w = InstanceID
68
69 GS - r0.xyw, r1.xyz = per-vertex offsets
70 r0.z = PrimitiveID
71
72 TCS - .x = PatchID
73 .y = RelPatchID (??)
74 .z = InvocationID
75 .w = tess factor base.
76
77 TES - .x = TessCoord.x
78 - .y = TessCoord.y
79 - .z = RelPatchID (??)
80 - .w = PrimitiveID
81
82 PS - face_gpr.z = SampleMask
83 face_gpr.w = SampleID
84 */
85 #define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
86 static int r600_shader_from_tgsi(struct r600_context *rctx,
87 struct r600_pipe_shader *pipeshader,
88 union r600_shader_key key);
89
90 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
91 int size, unsigned comp_mask) {
92
93 if (!size)
94 return;
95
96 if (ps->num_arrays == ps->max_arrays) {
97 ps->max_arrays += 64;
98 ps->arrays = realloc(ps->arrays, ps->max_arrays *
99 sizeof(struct r600_shader_array));
100 }
101
102 int n = ps->num_arrays;
103 ++ps->num_arrays;
104
105 ps->arrays[n].comp_mask = comp_mask;
106 ps->arrays[n].gpr_start = start_gpr;
107 ps->arrays[n].gpr_count = size;
108 }
109
/* Debug helper: print the stream-output (transform feedback) map to
 * stderr, one line per output slot. */
static void r600_dump_streamout(struct pipe_stream_output_info *so)
{
	unsigned i;

	fprintf(stderr, "STREAMOUT\n");
	for (i = 0; i < so->num_outputs; i++) {
		/* Component mask of the written channels, shifted to the
		 * output's start component. */
		unsigned mask = ((1 << so->output[i].num_components) - 1) <<
				so->output[i].start_component;
		fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
			i,
			so->output[i].stream,
			so->output[i].output_buffer,
			so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
			so->output[i].register_index,
			mask & 1 ? "x" : "",
			mask & 2 ? "y" : "",
			mask & 4 ? "z" : "",
			mask & 8 ? "w" : "",
			/* dst_offset < start_component means the components
			 * get shifted down before storing ("lowered"). */
			so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
	}
}
131
132 static int store_shader(struct pipe_context *ctx,
133 struct r600_pipe_shader *shader)
134 {
135 struct r600_context *rctx = (struct r600_context *)ctx;
136 uint32_t *ptr, i;
137
138 if (shader->bo == NULL) {
139 shader->bo = (struct r600_resource*)
140 pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
141 if (shader->bo == NULL) {
142 return -ENOMEM;
143 }
144 ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
145 if (R600_BIG_ENDIAN) {
146 for (i = 0; i < shader->shader.bc.ndw; ++i) {
147 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
148 }
149 } else {
150 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
151 }
152 rctx->b.ws->buffer_unmap(shader->bo->buf);
153 }
154
155 return 0;
156 }
157
/* Top-level shader compile entry point: translate TGSI to r600 bytecode,
 * optionally run the SB optimizing backend, upload the result to a GPU
 * buffer, and build the per-stage hardware state.  On any failure the
 * partially-built shader is destroyed and a negative errno is returned. */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b,
					 tgsi_get_processor_type(sel->tokens));
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}
	/* The SB backend is disabled for several shader classes it cannot
	 * handle correctly: LS vertex shaders, tessellation stages, and
	 * shaders using doubles, atomics or images. */
	if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	use_sb &= !shader->shader.uses_atomics;
	use_sb &= !shader->shader.uses_images;

	/* Check if the bytecode has already been built. */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		/* SB also handles the disassembly path when requested. */
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
					     dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case PIPE_SHADER_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case PIPE_SHADER_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case PIPE_SHADER_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
292
/* Release all resources owned by a compiled shader: the GPU buffer,
 * the bytecode storage, and the cached command buffer. */
void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
299
300 /*
301 * tgsi -> r600 shader
302 */
303 struct r600_shader_tgsi_instruction;
304
/* Decoded form of one TGSI source operand, ready to be copied into
 * r600 ALU source slots. */
struct r600_shader_src {
	unsigned				sel;        /* register/selector */
	unsigned				swizzle[4]; /* per-channel swizzle */
	unsigned				neg;        /* negate modifier */
	unsigned				abs;        /* absolute-value modifier */
	unsigned				rel;        /* relative (indirect) addressing */
	unsigned				kc_bank;    /* constant-cache bank */
	boolean					kc_rel;     /* true if cache bank is indexed */
	uint32_t				value[4];   /* literal values when sel is a literal */
};
315
/* State of one evergreen barycentric interpolator (see
 * eg_get_interpolator_index for the indexing scheme). */
struct eg_interp {
	boolean					enabled;
	unsigned				ij_index; /* which i/j pair this interpolator uses */
};
320
/* Per-compilation state carried through the TGSI -> r600 translation. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;        /* PIPE_SHADER_* stage being compiled */
	unsigned				file_offset[TGSI_FILE_COUNT]; /* GPR base per TGSI register file */
	unsigned				temp_reg;    /* first driver-reserved temp GPR */
	const struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];      /* decoded sources of current instruction */
	uint32_t				*literals;
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used; /* see r600_get_temp() */
	/* needed for evergreen interpolation */
	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean					clip_vertex_write;
	unsigned				cv_output;        /* output slot of CLIPVERTEX */
	unsigned				edgeflag_output;  /* output slot of EDGEFLAG */
	int					fragcoord_input;  /* input slot of POSITION in FS, or unset */
	int					native_integers;
	int					next_ring_offset;     /* GS input ring allocation cursor */
	int					gs_out_ring_offset;   /* GS output ring allocation cursor */
	int					gs_next_vertex;
	struct r600_shader			*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	int					gs_rotated_input[2];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
	unsigned				tess_input_info; /* temp with tess input offsets */
	unsigned				tess_output_info; /* temp with tess output offsets */
	unsigned				thread_id_gpr; /* temp with thread id calculated for images */
};
359
/* Entry in the per-chip TGSI opcode dispatch tables: the r600 ALU op to
 * use plus the translation callback for one TGSI opcode. */
struct r600_shader_tgsi_instruction {
	unsigned	op;
	int (*process)(struct r600_shader_ctx *ctx);
};
364
365 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
366 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
367 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
368 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
369 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
370 static int tgsi_else(struct r600_shader_ctx *ctx);
371 static int tgsi_endif(struct r600_shader_ctx *ctx);
372 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
373 static int tgsi_endloop(struct r600_shader_ctx *ctx);
374 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
375 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
376 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
377 unsigned int dst_reg);
378 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
379 const struct r600_shader_src *shader_src,
380 unsigned chan);
381 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
382 unsigned dst_reg, unsigned mask);
383
/* Return the index of the highest channel (0..3) set in a TGSI write
 * mask, or 0 when the mask is empty. */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	for (chan = 3; chan >= 0; chan--) {
		if (writemask & (1u << chan))
			return chan;
	}
	return 0;
}
395
/* Validate the current TGSI instruction against translator limitations.
 * Returns 0 if supported, -EINVAL otherwise. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	unsigned j;

	/* DFRACEXP is the only opcode allowed to have two destinations. */
	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	/* 2D (dimensioned) source registers are only supported for a few
	 * file/stage combinations; anything else is rejected.  Note the
	 * deliberate case fallthroughs: an unmatched stage falls through
	 * to the next check and ultimately to the error default. */
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
			switch (i->Src[j].Register.File) {
			case TGSI_FILE_CONSTANT:
			case TGSI_FILE_HW_ATOMIC:
				break;
			case TGSI_FILE_INPUT:
				if (ctx->type == PIPE_SHADER_GEOMETRY ||
				    ctx->type == PIPE_SHADER_TESS_CTRL ||
				    ctx->type == PIPE_SHADER_TESS_EVAL)
					break;
				/* fallthrough */
			case TGSI_FILE_OUTPUT:
				if (ctx->type == PIPE_SHADER_TESS_CTRL)
					break;
				/* fallthrough */
			default:
				R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
					 i->Src[j].Register.File,
					 i->Src[j].Register.Dimension);
				return -EINVAL;
			}
		}
	}
	/* Dimensioned destinations are only valid in TCS. */
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			if (ctx->type == PIPE_SHADER_TESS_CTRL)
				continue;
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}
443
444 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
445 {
446 if (interpolate == TGSI_INTERPOLATE_COLOR ||
447 interpolate == TGSI_INTERPOLATE_LINEAR ||
448 interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
449 {
450 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
451 int loc;
452
453 switch(location) {
454 case TGSI_INTERPOLATE_LOC_CENTER:
455 loc = 1;
456 break;
457 case TGSI_INTERPOLATE_LOC_CENTROID:
458 loc = 2;
459 break;
460 case TGSI_INTERPOLATE_LOC_SAMPLE:
461 default:
462 loc = 0; break;
463 }
464
465 return is_linear * 3 + loc;
466 }
467
468 return -1;
469 }
470
/* Copy the i/j barycentric pair index of the interpolator matching this
 * input's mode/location onto the input itself.  The interpolator must
 * have been set up earlier (hence the assert). */
static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
					     int input)
{
	int i = eg_get_interpolator_index(
		ctx->shader->input[input].interpolate,
		ctx->shader->input[input].interpolate_location);
	assert(i >= 0);
	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
}
480
/* Emit the evergreen interpolation ALU sequence for one interpolated
 * input: eight INTERP ops (a ZW group then an XY group), of which only
 * the middle four write the input's GPR.  The i/j pair location is
 * derived from the input's ij_index (two pairs packed per GPR). */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		/* Only slots 2..5 (ZW's z/w results and XY's x/y results)
		 * actually write the destination. */
		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* Alternate between the j and i barycentric channels. */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
521
522 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
523 {
524 int i, r;
525 struct r600_bytecode_alu alu;
526
527 for (i = 0; i < 4; i++) {
528 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
529
530 alu.op = ALU_OP1_INTERP_LOAD_P0;
531
532 alu.dst.sel = ctx->shader->input[input].gpr;
533 alu.dst.write = 1;
534
535 alu.dst.chan = i;
536
537 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
538 alu.src[0].chan = i;
539
540 if (i == 3)
541 alu.last = 1;
542 r = r600_bytecode_add_alu(ctx->bc, &alu);
543 if (r)
544 return r;
545 }
546 return 0;
547 }
548
549 /*
550 * Special export handling in shaders
551 *
552 * shader export ARRAY_BASE for EXPORT_POS:
553 * 60 is position
554 * 61 is misc vector
555 * 62, 63 are clip distance vectors
556 *
557 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
558 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
559 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
560 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
561 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
562 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
563 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
564 * exclusive from render target index)
565 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
566 *
567 *
568 * shader export ARRAY_BASE for EXPORT_PIXEL:
569 * 0-7 CB targets
570 * 61 computed Z vector
571 *
572 * The use of the values exported in the computed Z vector are controlled
573 * by DB_SHADER_CONTROL:
574 * Z_EXPORT_ENABLE - Z as a float in RED
575 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
576 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
577 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
578 * DB_SOURCE_FORMAT - export control restrictions
579 *
580 */
581
582
583 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
584 static int r600_spi_sid(struct r600_shader_io * io)
585 {
586 int index, name = io->name;
587
588 /* These params are handled differently, they don't need
589 * semantic indices, so we'll use 0 for them.
590 */
591 if (name == TGSI_SEMANTIC_POSITION ||
592 name == TGSI_SEMANTIC_PSIZE ||
593 name == TGSI_SEMANTIC_EDGEFLAG ||
594 name == TGSI_SEMANTIC_FACE ||
595 name == TGSI_SEMANTIC_SAMPLEMASK)
596 index = 0;
597 else {
598 if (name == TGSI_SEMANTIC_GENERIC) {
599 /* For generic params simply use sid from tgsi */
600 index = io->sid;
601 } else {
602 /* For non-generic params - pack name and sid into 8 bits */
603 index = 0x80 | (name<<3) | (io->sid);
604 }
605
606 /* Make sure that all really used indices have nonzero value, so
607 * we can just compare it to 0 later instead of comparing the name
608 * with different values to detect special cases. */
609 index++;
610 }
611
612 return index;
613 };
614
615 /* we need this to get a common lds index for vs/tcs/tes input/outputs */
616 int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
617 {
618 switch (semantic_name) {
619 case TGSI_SEMANTIC_POSITION:
620 return 0;
621 case TGSI_SEMANTIC_PSIZE:
622 return 1;
623 case TGSI_SEMANTIC_CLIPDIST:
624 assert(index <= 1);
625 return 2 + index;
626 case TGSI_SEMANTIC_GENERIC:
627 if (index <= 63-4)
628 return 4 + index - 9;
629 else
630 /* same explanation as in the default statement,
631 * the only user hitting this is st/nine.
632 */
633 return 0;
634
635 /* patch indices are completely separate and thus start from 0 */
636 case TGSI_SEMANTIC_TESSOUTER:
637 return 0;
638 case TGSI_SEMANTIC_TESSINNER:
639 return 1;
640 case TGSI_SEMANTIC_PATCH:
641 return 2 + index;
642
643 default:
644 /* Don't fail here. The result of this function is only used
645 * for LS, TCS, TES, and GS, where legacy GL semantics can't
646 * occur, but this function is called for all vertex shaders
647 * before it's known whether LS will be compiled or not.
648 */
649 return 0;
650 }
651 }
652
653 /* turn input into interpolate on EG */
/* turn input into interpolate on EG */
/* Allocates the input's LDS parameter slot and emits either the
 * barycentric interpolation sequence (interpolate > 0) or a flat P0
 * load.  Inputs without an spi_sid are skipped. */
static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
{
	int r = 0;

	if (ctx->shader->input[index].spi_sid) {
		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
		if (ctx->shader->input[index].interpolate > 0) {
			evergreen_interp_assign_ij_index(ctx, index);
			r = evergreen_interp_alu(ctx, index);
		} else {
			r = evergreen_interp_flat(ctx, index);
		}
	}
	return r;
}
669
/* Two-sided lighting: per channel, select the front or back color into
 * the front color's GPR based on the face register
 * (CNDGT: dst = face > 0 ? front : back). */
static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
{
	struct r600_bytecode_alu alu;
	int i, r;
	int gpr_front = ctx->shader->input[front].gpr;
	int gpr_back = ctx->shader->input[back].gpr;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;
		alu.dst.write = 1;
		alu.dst.sel = gpr_front;
		alu.src[0].sel = ctx->face_gpr;
		alu.src[1].sel = gpr_front;
		alu.src[2].sel = gpr_back;

		alu.dst.chan = i;
		alu.src[1].chan = i;
		alu.src[2].chan = i;
		alu.last = (i==3);

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;
	}

	return 0;
}
698
699 /* execute a single slot ALU calculation */
700 static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
701 int dst_sel, int dst_chan,
702 int src0_sel, unsigned src0_chan_val,
703 int src1_sel, unsigned src1_chan_val)
704 {
705 struct r600_bytecode_alu alu;
706 int r, i;
707
708 if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
709 for (i = 0; i < 4; i++) {
710 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
711 alu.op = op;
712 alu.src[0].sel = src0_sel;
713 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
714 alu.src[0].value = src0_chan_val;
715 else
716 alu.src[0].chan = src0_chan_val;
717 alu.src[1].sel = src1_sel;
718 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
719 alu.src[1].value = src1_chan_val;
720 else
721 alu.src[1].chan = src1_chan_val;
722 alu.dst.sel = dst_sel;
723 alu.dst.chan = i;
724 alu.dst.write = i == dst_chan;
725 alu.last = (i == 3);
726 r = r600_bytecode_add_alu(ctx->bc, &alu);
727 if (r)
728 return r;
729 }
730 return 0;
731 }
732
733 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
734 alu.op = op;
735 alu.src[0].sel = src0_sel;
736 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
737 alu.src[0].value = src0_chan_val;
738 else
739 alu.src[0].chan = src0_chan_val;
740 alu.src[1].sel = src1_sel;
741 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
742 alu.src[1].value = src1_chan_val;
743 else
744 alu.src[1].chan = src1_chan_val;
745 alu.dst.sel = dst_sel;
746 alu.dst.chan = dst_chan;
747 alu.dst.write = 1;
748 alu.last = 1;
749 r = r600_bytecode_add_alu(ctx->bc, &alu);
750 if (r)
751 return r;
752 return 0;
753 }
754
755 /* execute a single slot ALU calculation */
/* execute a single slot ALU calculation */
/* Emit one three-source ALU op writing dst_sel.dst_chan.  As in
 * single_alu_op2, a V_SQ_ALU_SRC_LITERAL selector turns the matching
 * chan_val into an immediate.  No dst.write is set here; NOTE(review):
 * presumably OP3 encodings always write their destination — confirm
 * against the ISA before relying on this elsewhere. */
static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val,
			  int src2_sel, unsigned src2_chan_val)
{
	struct r600_bytecode_alu alu;
	int r;

	/* validate this for other ops */
	assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.src[2].sel = src2_sel;
	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[2].value = src2_chan_val;
	else
		alu.src[2].chan = src2_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
793
794 /* put it in temp_reg.x */
/* put it in temp_reg.x */
/* Compute the LDS base offset of the current patch into temp_reg.x:
 * patch_stride * rel_patch_id + patch0 offset (the per-patch-variable
 * offset when is_patch_var, the per-vertex data offset otherwise). */
static int get_lds_offset0(struct r600_shader_ctx *ctx,
			   int rel_patch_chan,
			   int temp_reg, bool is_patch_var)
{
	int r;

	/* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
	/* ADD
	   Dimension - patch0_offset (input_vals.z),
	   Non-dim - patch0_data_offset (input_vals.w)
	*/
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   ctx->tess_output_info, 0,
			   0, rel_patch_chan,
			   ctx->tess_output_info, is_patch_var ? 3 : 2);
	if (r)
		return r;
	return 0;
}
815
/* GPR backing a TGSI address register: index 0 is the AR register,
 * higher indices map to the bytecode's index registers. */
static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
{
	return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
}
820
/* Allocate the next free driver-reserved temporary GPR above temp_reg. */
static int r600_get_temp(struct r600_shader_ctx *ctx)
{
	return ctx->temp_reg + ctx->max_driver_temp_used++;
}
825
/* Append a PRIMID output (GPR0.z, constant interpolation) to the vertex
 * shader's output list, using the given SPI semantic index. */
static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
{
	int i;
	i = ctx->shader->noutput++;
	ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
	ctx->shader->output[i].sid = 0;
	ctx->shader->output[i].gpr = 0;
	ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
	ctx->shader->output[i].write_mask = 0x4; /* z channel only */
	ctx->shader->output[i].spi_sid = prim_id_sid;

	return 0;
}
839
/* Emit a barrier: a bare ALU instruction whose opcode comes from the
 * dispatch-table entry for the current TGSI instruction. */
static int tgsi_barrier(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.last = 1;

	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
854
/* Process one TGSI declaration token: record inputs/outputs on the
 * shader, register indirect temp arrays and HW atomic ranges, and emit
 * setup code for the system values that need it. */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < ARRAY_SIZE(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == PIPE_SHADER_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				if (ctx->bc->chip_class >= EVERGREEN) {
					/* EG+ loads FS inputs through explicit
					 * interpolation instructions. */
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < ARRAY_SIZE(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == PIPE_SHADER_VERTEX ||
			    ctx->type == PIPE_SHADER_GEOMETRY ||
			    ctx->type == PIPE_SHADER_TESS_EVAL) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				/* Track which misc-vector exports this shader
				 * needs (see PA_CL_VS_OUT_CNTL notes above). */
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == PIPE_SHADER_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == PIPE_SHADER_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		/* Only indirectly-addressed temp arrays need tracking. */
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				r600_add_gpr_array(ctx->shader,
						   ctx->file_offset[TGSI_FILE_TEMPORARY] +
						   d->Range.First,
						   d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
	case TGSI_FILE_IMAGE:
		break;

	case TGSI_FILE_HW_ATOMIC:
		/* Record the atomic counter range and its hardware slot. */
		i = ctx->shader->nhwatomic_ranges;
		ctx->shader->atomics[i].start = d->Range.First;
		ctx->shader->atomics[i].end = d->Range.Last;
		ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
		ctx->shader->atomics[i].array_id = d->Array.ArrayID;
		ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
		ctx->shader->nhwatomic_ranges++;
		ctx->shader->nhwatomic += count;
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			/* InstanceID arrives as an int in r0.w; convert it in
			 * place to float for non-native-integer shaders. */
			if (!ctx->native_integers) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_INT_TO_FLT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
			/* Fetch the tess factors from LDS: compute the patch
			 * offset, add the factor's LDS slot, then load all
			 * four components into a fixed GPR (3 = inner,
			 * 2 = outer). */
			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
			unsigned temp_reg = r600_get_temp(ctx);

			r = get_lds_offset0(ctx, 2, temp_reg, true);
			if (r)
				return r;

			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 0,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;

			do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
		}
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
			/* MOV r1.x, r0.x;
			   MOV r1.y, r0.y;
			*/
			for (i = 0; i < 2; i++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = 0;
				alu.src[0].chan = 0 + i;
				alu.dst.sel = 1;
				alu.dst.chan = 0 + i;
				alu.dst.write = 1;
				alu.last = (i == 1) ? 1 : 0;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			/* ADD r1.z, 1.0f, -r0.x */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = V_SQ_ALU_SRC_1;
			alu.src[1].sel = 1;
			alu.src[1].chan = 0;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* ADD r1.z, r1.z, -r1.y */
			/* alu is deliberately reused without memset; only the
			 * fields below change from the previous instruction. */
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = 1;
			alu.src[0].chan = 2;
			alu.src[1].sel = 1;
			alu.src[1].chan = 1;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
			break;
		}
		break;
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
1085
/* Scan the shader for system-value inputs (SAMPLEMASK, SAMPLEID/SAMPLEPOS)
 * and for interpolateAt* instructions, then reserve one GPR per enabled
 * system value starting at gpr_offset, recording each as a constant-
 * interpolated shader input.
 *
 * Also marks any interpolators required by INTERP_SAMPLE/OFFSET/CENTROID
 * in ctx->eg_interpolators as a side effect.
 *
 * Returns the first free GPR after the reserved ones (gpr_offset when
 * nothing was enabled; 0 if tgsi_parse_init fails).
 */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;                 /* set when the shader needs this value */
		int *reg;                        /* where the allocated GPR index is stored */
		unsigned name, alternate_name;   /* TGSI semantics that enable this entry */
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int num_regs = 0;
	unsigned k, i;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					inputs[1].enabled = true; /* needs SAMPLEID */
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* Reserve a GPR for each enabled system value and register it as a
	 * constant-interpolated input; the GPR index is also written back
	 * through inputs[i].reg (face_gpr / fixed_pt_position_gpr). */
	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;
			ctx->shader->nsys_inputs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
1168
1169 /*
1170 * for evergreen we need to scan the shader to find the number of GPRs we need to
1171 * reserve for interpolation and system values
1172 *
1173 * we need to know if we are going to emit
1174 * any sample or centroid inputs
1175 * if perspective and linear are required
1176 */
/* Determine which barycentric interpolators the fragment shader needs
 * (from declared inputs and from interpolateAt* instructions), assign each
 * enabled one an ij_index in priority order, and reserve the leading GPRs:
 * one GPR per two ij pairs (hence the round-up halving below), followed by
 * any system-value inputs.
 *
 * Returns the first free GPR (0 if tgsi_parse_init fails). */
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	unsigned i;
	int num_baryc;
	struct tgsi_parse_context parse;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	/* Mark interpolators required by the declared inputs. */
	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		}
	}

	tgsi_parse_free(&parse);

	/* assign gpr to each interpolator according to priority */
	num_baryc = 0;
	for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
		if (ctx->eg_interpolators[i].enabled) {
			ctx->eg_interpolators[i].ij_index = num_baryc;
			num_baryc ++;
		}
	}

	/* XXX PULL MODEL and LINE STIPPLE */

	/* two ij pairs fit in one GPR */
	num_baryc = (num_baryc + 1) >> 1;
	return allocate_system_value_inputs(ctx, num_baryc);
}
1249
/* sample_id_sel == NULL means fetch for current sample */
/* Fetch a sample position (xyzw, FMT_32_32_32_32_FLOAT) from the
 * R600_BUFFER_INFO_CONST_BUFFER into a freshly allocated temp GPR.
 *
 * When sample_id is NULL the index comes from the current sample's id in
 * fixed_pt_position_gpr.w; otherwise the given source operand (channel
 * chan_sel) is first copied into the temp and used as the fetch index.
 *
 * Returns the temp GPR holding the result, or a (negative) error code
 * from bytecode emission. Note the same temp serves as both fetch index
 * source and destination. */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	assert(ctx->fixed_pt_position_gpr != -1);

	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		struct r600_bytecode_alu alu;

		/* copy the caller-supplied sample index into t1.x */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 1; // first element is size of buffer
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}
1304
/* Translate a TGSI source operand into r600_shader_src form: swizzle,
 * negate/abs flags and the register select.
 *
 * Immediates: when all four swizzle components read the same channel, try
 * to map the value to an inline hardware constant via
 * r600_bytecode_special_constants; otherwise keep it as a literal with the
 * four dwords copied from ctx->literals.
 *
 * System values are rewritten to the GPR/channel where the hardware or the
 * shader prolog placed them (e.g. SAMPLEMASK in face_gpr.z, SAMPLEID in
 * fixed_pt_position_gpr.w, SAMPLEPOS fetched on demand via
 * load_sample_position; TESSCOORD/TESSOUTER/TESSINNER live in GPRs 1/2/3
 * as set up by the declaration handling). */
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* TESS_CTRL invocation id (the non-TESS_CTRL case matched above) */
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
			r600_src->sel = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
			r600_src->sel = 2;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
			/* vertex count is kept in tess_input_info: .z for TCS, .w otherwise */
			if (ctx->type == PIPE_SHADER_TESS_CTRL) {
				r600_src->sel = ctx->tess_input_info;
				r600_src->swizzle[0] = 2;
				r600_src->swizzle[1] = 2;
				r600_src->swizzle[2] = 2;
				r600_src->swizzle[3] = 2;
			} else {
				r600_src->sel = ctx->tess_input_info;
				r600_src->swizzle[0] = 3;
				r600_src->swizzle[1] = 3;
				r600_src->swizzle[2] = 3;
				r600_src->swizzle[3] = 3;
			}
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
		}
	} else {
		/* Plain register file: apply the per-file GPR offset. */
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}
1422
/* Fetch a relatively-addressed constant through the vertex cache.
 *
 * Reads constant buffer cb_idx at element (AR + offset), where the address
 * register value is taken from channel ar_chan of ctx->bc->ar_reg, and
 * writes the xyzw result into dst_reg. cb_rel selects the buffer index
 * mode for indirectly indexed constant buffers.
 *
 * Returns 0 on success or a bytecode-emission error code. */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
				unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
				unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		/* dst_reg.ar_chan = AR + offset; use dst_reg as scratch for
		 * the adjusted index (it is overwritten by the fetch anyway). */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.endian = r600_endian_swap(32);
	vtx.buffer_index_mode = cb_rel;	// cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1478
/* Fetch one GS per-vertex input from the ESGS ring into dst_reg.
 *
 * Handles direct addressing, an indirect vertex index
 * (src->Dimension.Indirect) and an indirect element index
 * (src->Register.Indirect); both indirect forms compute the final ring
 * offset into a temp (t2) first.
 *
 * Returns 0 on success or a bytecode-emission error code. */
static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	int r;
	unsigned index = src->Register.Index;
	unsigned vtx_id = src->Dimension.Index;
	int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
	int offset_chan = vtx_id % 3;
	int t2 = 0;

	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */

	/* R0.z carries PrimitiveID, so vertex 2's offset lives in R0.w instead */
	if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
		offset_chan = 3;

	if (src->Dimension.Indirect || src->Register.Indirect)
		t2 = r600_get_temp(ctx);

	if (src->Dimension.Indirect) {
		int treg[3];
		struct r600_bytecode_alu alu;
		int r, i;
		unsigned addr_reg;
		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
		if (src->DimIndirect.Index > 0) {
			/* relative GPR addressing below goes through AR, so copy
			 * the non-default address register into ar_reg first */
			r = single_alu_op2(ctx, ALU_OP1_MOV,
					   ctx->bc->ar_reg, 0,
					   addr_reg, 0,
					   0, 0);
			if (r)
				return r;
		}
		/*
		we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
		at least this is what fglrx seems to do. */
		for (i = 0; i < 3; i++) {
			treg[i] = r600_get_temp(ctx);
		}
		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

		/* scatter the three per-vertex offsets into treg[0..2].x */
		for (i = 0; i < 3; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].sel = ctx->gs_rotated_input[0];
			alu.src[0].chan = i == 2 ? 3 : i;
			alu.dst.sel = treg[i];
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		/* t2.x = treg[AR].x — pick the offset of the indexed vertex */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = treg[0];
		alu.src[0].rel = 1;
		alu.dst.sel = t2;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
	}

	if (src->Register.Indirect) {
		int addr_reg;
		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];

		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);

		/* pull the value from index_reg:
		 * t2.x = (addr + first) * 4 + per-vertex base offset */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   t2, 1,
				   addr_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, first);
		if (r)
			return r;
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   t2, 0,
				   t2, 1,
				   V_SQ_ALU_SRC_LITERAL, 4,
				   offset_reg, offset_chan);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
		index = src->Register.Index - first;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = offset_reg;
	vtx.src_sel_x = offset_chan;
	vtx.offset = index * 16; /*bytes*/
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	if (ctx->bc->chip_class >= EVERGREEN) {
		vtx.use_const_fields = 1;
	} else {
		vtx.data_format = FMT_32_32_32_32_FLOAT;
	}

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1595
1596 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1597 {
1598 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1599 unsigned i;
1600
1601 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1602 struct tgsi_full_src_register *src = &inst->Src[i];
1603
1604 if (src->Register.File == TGSI_FILE_INPUT) {
1605 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1606 /* primitive id is in R0.z */
1607 ctx->src[i].sel = 0;
1608 ctx->src[i].swizzle[0] = 2;
1609 }
1610 }
1611 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1612 int treg = r600_get_temp(ctx);
1613
1614 fetch_gs_input(ctx, src, treg);
1615 ctx->src[i].sel = treg;
1616 ctx->src[i].rel = 0;
1617 }
1618 }
1619 return 0;
1620 }
1621
1622
1623 /* Tessellation shaders pass outputs to the next shader using LDS.
1624 *
1625 * LS outputs = TCS(HS) inputs
1626 * TCS(HS) outputs = TES(DS) inputs
1627 *
1628 * The LDS layout is:
1629 * - TCS inputs for patch 0
1630 * - TCS inputs for patch 1
1631 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
1632 * - ...
1633 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
1634 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
1635 * - TCS outputs for patch 1
1636 * - Per-patch TCS outputs for patch 1
1637 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
1638 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
1639 * - ...
1640 *
1641 * All three shaders VS(LS), TCS, TES share the same LDS space.
1642 */
/* this will return with the dw address in temp_reg.x */
/* Compute the LDS address of a TCS/TES input or output operand, adding it
 * to the base address already held in temp_reg.x on entry.
 *
 * Exactly one of dst/src is non-NULL and names the operand; the address
 * computation is identical for both. For 2-dimensional operands (arrays
 * of vertices) the vertex base is added first:
 *   temp.x += vertex_index * stride, with the stride read from
 *   stride_bytes_reg.[stride_bytes_chan].
 * Then the per-register offset is added: 16 per LDS slot, where
 * r600_get_lds_unique_index() maps semantic name/index to a slot and an
 * indirect element contributes 16 * addr_reg.
 *
 * Returns 0 on success, negative on error. */
static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
				 const struct tgsi_full_dst_register *dst,
				 const struct tgsi_full_src_register *src,
				 int stride_bytes_reg, int stride_bytes_chan)
{
	struct tgsi_full_dst_register reg;
	ubyte *name, *index, *array_first;
	int r;
	int param;
	struct tgsi_shader_info *info = &ctx->info;
	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		int sel, chan;
		if (reg.Dimension.Indirect) {
			unsigned addr_reg;
			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);

			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
			/* pull the value from index_reg */
			sel = addr_reg;
			chan = 0;
		} else {
			sel = V_SQ_ALU_SRC_LITERAL;
			chan = reg.Dimension.Index;
		}

		/* temp.x += stride * vertex_index */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   stride_bytes_reg, stride_bytes_chan,
				   sel, chan,
				   temp_reg, 0);
		if (r)
			return r;
	}

	/* Pick the semantic tables for the operand's file. */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return -1;
	}
	if (reg.Register.Indirect) {
		int addr_reg;
		int first;
		/* Add the relative address of the element. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);

		/* pull the value from index_reg: temp.x += 16 * addr */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 16,
				   addr_reg, 0,
				   temp_reg, 0);
		if (r)
			return r;

		param = r600_get_lds_unique_index(name[first],
						  index[first]);

	} else {
		param = r600_get_lds_unique_index(name[reg.Register.Index],
						  index[reg.Register.Index]);
	}

	/* add to base_addr - passed in temp_reg.x */
	if (param) {
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, param * 16);
		if (r)
			return r;

	}
	return 0;
}
1745
1746 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
1747 unsigned dst_reg, unsigned mask)
1748 {
1749 struct r600_bytecode_alu alu;
1750 int r, i;
1751
1752 if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
1753 ctx->bc->force_add_cf = 1;
1754
1755 for (i = 1; i < 4; i++) {
1756 if (!(mask & (1 << i)))
1757 continue;
1758
1759 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1760 temp_reg, i,
1761 temp_reg, 0,
1762 V_SQ_ALU_SRC_LITERAL, 4 * i);
1763 if (r)
1764 return r;
1765 }
1766 for (i = 0; i < 4; i++) {
1767 if (! (mask & (1 << i)))
1768 continue;
1769
1770 /* emit an LDS_READ_RET */
1771 memset(&alu, 0, sizeof(alu));
1772 alu.op = LDS_OP1_LDS_READ_RET;
1773 alu.src[0].sel = temp_reg;
1774 alu.src[0].chan = i;
1775 alu.src[1].sel = V_SQ_ALU_SRC_0;
1776 alu.src[2].sel = V_SQ_ALU_SRC_0;
1777 alu.dst.chan = 0;
1778 alu.is_lds_idx_op = true;
1779 alu.last = 1;
1780 r = r600_bytecode_add_alu(ctx->bc, &alu);
1781 if (r)
1782 return r;
1783 }
1784 for (i = 0; i < 4; i++) {
1785 if (! (mask & (1 << i)))
1786 continue;
1787 /* then read from LDS_OQ_A_POP */
1788 memset(&alu, 0, sizeof(alu));
1789
1790 alu.op = ALU_OP1_MOV;
1791 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
1792 alu.src[0].chan = 0;
1793 alu.dst.sel = dst_reg;
1794 alu.dst.chan = i;
1795 alu.dst.write = 1;
1796 alu.last = 1;
1797 r = r600_bytecode_add_alu(ctx->bc, &alu);
1798 if (r)
1799 return r;
1800 }
1801 return 0;
1802 }
1803
1804 static int fetch_mask(struct tgsi_src_register *reg)
1805 {
1806 int mask = 0;
1807 mask |= 1 << reg->SwizzleX;
1808 mask |= 1 << reg->SwizzleY;
1809 mask |= 1 << reg->SwizzleZ;
1810 mask |= 1 << reg->SwizzleW;
1811 return mask;
1812 }
1813
1814 static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1815 {
1816 int r;
1817 unsigned temp_reg = r600_get_temp(ctx);
1818
1819 r = get_lds_offset0(ctx, 2, temp_reg,
1820 src->Register.Dimension ? false : true);
1821 if (r)
1822 return r;
1823
1824 /* the base address is now in temp.x */
1825 r = r600_get_byte_address(ctx, temp_reg,
1826 NULL, src, ctx->tess_output_info, 1);
1827 if (r)
1828 return r;
1829
1830 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1831 if (r)
1832 return r;
1833 return 0;
1834 }
1835
1836 static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1837 {
1838 int r;
1839 unsigned temp_reg = r600_get_temp(ctx);
1840
1841 /* t.x = ips * r0.y */
1842 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
1843 temp_reg, 0,
1844 ctx->tess_input_info, 0,
1845 0, 1);
1846
1847 if (r)
1848 return r;
1849
1850 /* the base address is now in temp.x */
1851 r = r600_get_byte_address(ctx, temp_reg,
1852 NULL, src, ctx->tess_input_info, 1);
1853 if (r)
1854 return r;
1855
1856 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1857 if (r)
1858 return r;
1859 return 0;
1860 }
1861
1862 static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1863 {
1864 int r;
1865 unsigned temp_reg = r600_get_temp(ctx);
1866
1867 r = get_lds_offset0(ctx, 1, temp_reg,
1868 src->Register.Dimension ? false : true);
1869 if (r)
1870 return r;
1871 /* the base address is now in temp.x */
1872 r = r600_get_byte_address(ctx, temp_reg,
1873 NULL, src,
1874 ctx->tess_output_info, 1);
1875 if (r)
1876 return r;
1877
1878 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1879 if (r)
1880 return r;
1881 return 0;
1882 }
1883
1884 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
1885 {
1886 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1887 unsigned i;
1888
1889 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1890 struct tgsi_full_src_register *src = &inst->Src[i];
1891
1892 if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
1893 int treg = r600_get_temp(ctx);
1894 fetch_tes_input(ctx, src, treg);
1895 ctx->src[i].sel = treg;
1896 ctx->src[i].rel = 0;
1897 }
1898 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
1899 int treg = r600_get_temp(ctx);
1900 fetch_tcs_input(ctx, src, treg);
1901 ctx->src[i].sel = treg;
1902 ctx->src[i].rel = 0;
1903 }
1904 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
1905 int treg = r600_get_temp(ctx);
1906 fetch_tcs_output(ctx, src, treg);
1907 ctx->src[i].sel = treg;
1908 ctx->src[i].rel = 0;
1909 }
1910 }
1911 return 0;
1912 }
1913
/* Prepare the constant-file sources of the current instruction. All
 * sources are first translated with tgsi_src(). Then, when the
 * instruction reads more than one constant, every constant but one is
 * copied into a temp GPR (j counts the constants still to lower; the last
 * one stays in the kcache). Indirectly addressed constants are always
 * fetched into a temp via the vertex cache.
 *
 * Returns 0 on success or a bytecode-emission error code. */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* count constant sources; tgsi_src() fills ctx->src[] for all of them */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			/* Relatively addressed constant: fetch through the
			 * vertex cache (sel - 512 = constant element index). */
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* copy the constant to a temp GPR with four MOVs */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
1968
1969 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1970 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1971 {
1972 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1973 struct r600_bytecode_alu alu;
1974 int i, j, k, nliteral, r;
1975
1976 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1977 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1978 nliteral++;
1979 }
1980 }
1981 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1982 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1983 int treg = r600_get_temp(ctx);
1984 for (k = 0; k < 4; k++) {
1985 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1986 alu.op = ALU_OP1_MOV;
1987 alu.src[0].sel = ctx->src[i].sel;
1988 alu.src[0].chan = k;
1989 alu.src[0].value = ctx->src[i].value[k];
1990 alu.dst.sel = treg;
1991 alu.dst.chan = k;
1992 alu.dst.write = 1;
1993 if (k == 3)
1994 alu.last = 1;
1995 r = r600_bytecode_add_alu(ctx->bc, &alu);
1996 if (r)
1997 return r;
1998 }
1999 ctx->src[i].sel = treg;
2000 j--;
2001 }
2002 }
2003 return 0;
2004 }
2005
2006 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
2007 {
2008 int i, r, count = ctx->shader->ninput;
2009
2010 for (i = 0; i < count; i++) {
2011 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2012 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
2013 if (r)
2014 return r;
2015 }
2016 }
2017 return 0;
2018 }
2019
/* Emit MEM_STREAM CF instructions writing the outputs listed in 'so' to the
 * streamout (transform feedback) buffers.
 *
 * stream: when >= 0 only matching outputs are emitted (the GS copy shader
 * calls this once per stream); -1 emits everything.
 *
 * Also accumulates ctx->enabled_stream_buffers_mask for the state setup.
 * Returns 0 on success, -EINVAL on bad streamout state, or a bytecode error.
 */
static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
			  int stream, unsigned *stream_item_size UNUSED)
{
	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
	int j, r;
	unsigned i;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		r = -EINVAL;
		goto out_err;
	}
	/* Only 4 streamout buffers are addressable by the MEM_STREAM opcodes. */
	for (i = 0; i < so->num_outputs; i++) {
		if (so->output[i].output_buffer >= 4) {
			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
				 so->output[i].output_buffer);
			r = -EINVAL;
			goto out_err;
		}
	}

	/* Initialize locations where the outputs are stored. */
	for (i = 0; i < so->num_outputs; i++) {

		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
		start_comp[i] = so->output[i].start_component;
		/* Lower outputs with dst_offset < start_component.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X. */
		if (so->output[i].dst_offset < so->output[i].start_component) {
			unsigned tmp = r600_get_temp(ctx);

			/* MOV the used components down so they start at X of tmp. */
			for (j = 0; j < so->output[i].num_components; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = so_gpr[i];
				alu.src[0].chan = so->output[i].start_component + j;

				alu.dst.sel = tmp;
				alu.dst.chan = j;
				alu.dst.write = 1;
				if (j == so->output[i].num_components - 1)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			start_comp[i] = 0;
			so_gpr[i] = tmp;
		}
	}

	/* Write outputs to buffers. */
	for (i = 0; i < so->num_outputs; i++) {
		struct r600_bytecode_output output;

		/* NOTE(review): this filters on output_buffer, not so->output[i].stream —
		 * confirm against the GS copy shader caller which passes a stream id. */
		if (stream != -1 && stream != so->output[i].output_buffer)
			continue;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = so_gpr[i];
		output.elem_size = so->output[i].num_components - 1;
		if (output.elem_size == 2)
			output.elem_size = 3; // 3 not supported, write 4 with junk at end
		output.array_base = so->output[i].dst_offset - start_comp[i];
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
		output.burst_count = 1;
		/* array_size is an upper limit for the burst_count
		 * with MEM_STREAM instructions */
		output.array_size = 0xFFF;
		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];

		if (ctx->bc->chip_class >= EVERGREEN) {
			/* EG+ has one CF opcode per (stream, buffer) pair; the opcodes
			 * for stream N start 4 after those of stream N-1. */
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0_BUF0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM0_BUF1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM0_BUF2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM0_BUF3;
				break;
			}
			output.op += so->output[i].stream * 4;
			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
		} else {
			/* Pre-EG chips only have per-buffer opcodes (stream 0). */
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM3;
				break;
			}
			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
		}
		r = r600_bytecode_add_output(ctx->bc, &output);
		if (r)
			goto out_err;
	}
	return 0;
out_err:
	return r;
}
2141
2142 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2143 {
2144 struct r600_bytecode_alu alu;
2145 unsigned reg;
2146
2147 if (!ctx->shader->vs_out_edgeflag)
2148 return;
2149
2150 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2151
2152 /* clamp(x, 0, 1) */
2153 memset(&alu, 0, sizeof(alu));
2154 alu.op = ALU_OP1_MOV;
2155 alu.src[0].sel = reg;
2156 alu.dst.sel = reg;
2157 alu.dst.write = 1;
2158 alu.dst.clamp = 1;
2159 alu.last = 1;
2160 r600_bytecode_add_alu(ctx->bc, &alu);
2161
2162 memset(&alu, 0, sizeof(alu));
2163 alu.op = ALU_OP1_FLT_TO_INT;
2164 alu.src[0].sel = reg;
2165 alu.dst.sel = reg;
2166 alu.dst.write = 1;
2167 alu.last = 1;
2168 r600_bytecode_add_alu(ctx->bc, &alu);
2169 }
2170
/* Build the "GS copy shader": a small VS that runs after the geometry shader,
 * fetches the GS results back from the GSVS ring buffer and re-exports them
 * as regular position/param exports (plus streamout where requested).
 *
 * One predicated block is generated per enabled stream; R0.x carries the
 * vertex offset (low 30 bits) and the stream id (top 2 bits) packed together.
 * On success stores the new shader in gs->gs_copy_shader and returns the
 * result of r600_bytecode_build(); returns 0 if allocation fails. */
static int generate_gs_copy_shader(struct r600_context *rctx,
				   struct r600_pipe_shader *gs,
				   struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	unsigned ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int next_clip_pos = 61, next_param = 0;	/* export slots: 60 = pos, 61+ = misc/clip */
	unsigned i, j;
	int ring;
	bool only_ring_0 = true;
	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	/* The copy shader exports exactly the GS outputs. */
	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* Unpack the incoming R0.x: low 30 bits = ring offset, top 2 bits = stream. */
	/* R0.x = R0.x & 0x3fffffff */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30 */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring; one vec4 per output, into GPR i+1 */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	ctx.temp_reg = i + 1;
	/* One predicated streamout block per ring, highest ring first so the
	 * last (ring 0) block can fall through into the export code. */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				if (ring > 0)
					only_ring_0 = false;
				break;
			}
		}
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->chip_class == R600) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* only outputs belonging to stream 0 are exported to the rasterizer */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		/* swizzle 7 below means "mask this component"; 4 = const 0, 5 = const 1 */
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
			ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* The hardware requires at least one position export; emit a dummy one
	 * (all components masked) if the GS produced none. */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	/* Likewise at least one param export is required. */
	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	/* The final export of each kind must be the _DONE variant. */
	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	/* Patch the last predicated block's jump, like inside the ring loop. */
	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	/* one stack level for the predicate push/pop */
	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}
2481
2482 static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2483 {
2484 if (ind) {
2485 struct r600_bytecode_alu alu;
2486 int r;
2487
2488 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2489 alu.op = ALU_OP2_ADD_INT;
2490 alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2491 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2492 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2493 alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2494 alu.dst.write = 1;
2495 alu.last = 1;
2496 r = r600_bytecode_add_alu(ctx->bc, &alu);
2497 if (r)
2498 return r;
2499 }
2500 return 0;
2501 }
2502
/* Emit MEM_RING writes that store the current vertex's outputs into the
 * ESGS ring (when running as ES, offsets looked up from the GS inputs) or
 * the GSVS ring for 'stream'. With ind=true the writes are indexed by the
 * per-stream export offset register; otherwise the vertex offset is folded
 * into array_base. Increments ctx->gs_next_vertex. Always returns 0. */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int ring_offset;
	unsigned i, k;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			/* output not consumed by the GS - skip it */
			if (ring_offset == -1)
				continue;
		} else {
			/* GS case: outputs packed 16 bytes (one vec4) apart */
			ring_offset = idx * 16;
			idx++;
		}

		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	++ctx->gs_next_vertex;
	return 0;
}
2573
2574
2575 static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2576 {
2577 int r;
2578 struct r600_bytecode_vtx vtx;
2579 int temp_val = ctx->temp_reg;
2580 /* need to store the TCS output somewhere */
2581 r = single_alu_op2(ctx, ALU_OP1_MOV,
2582 temp_val, 0,
2583 V_SQ_ALU_SRC_LITERAL, 0,
2584 0, 0);
2585 if (r)
2586 return r;
2587
2588 /* used by VS/TCS */
2589 if (ctx->tess_input_info) {
2590 /* fetch tcs input values into resv space */
2591 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2592 vtx.op = FETCH_OP_VFETCH;
2593 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2594 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2595 vtx.mega_fetch_count = 16;
2596 vtx.data_format = FMT_32_32_32_32;
2597 vtx.num_format_all = 2;
2598 vtx.format_comp_all = 1;
2599 vtx.use_const_fields = 0;
2600 vtx.endian = r600_endian_swap(32);
2601 vtx.srf_mode_all = 1;
2602 vtx.offset = 0;
2603 vtx.dst_gpr = ctx->tess_input_info;
2604 vtx.dst_sel_x = 0;
2605 vtx.dst_sel_y = 1;
2606 vtx.dst_sel_z = 2;
2607 vtx.dst_sel_w = 3;
2608 vtx.src_gpr = temp_val;
2609 vtx.src_sel_x = 0;
2610
2611 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2612 if (r)
2613 return r;
2614 }
2615
2616 /* used by TCS/TES */
2617 if (ctx->tess_output_info) {
2618 /* fetch tcs output values into resv space */
2619 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2620 vtx.op = FETCH_OP_VFETCH;
2621 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2622 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2623 vtx.mega_fetch_count = 16;
2624 vtx.data_format = FMT_32_32_32_32;
2625 vtx.num_format_all = 2;
2626 vtx.format_comp_all = 1;
2627 vtx.use_const_fields = 0;
2628 vtx.endian = r600_endian_swap(32);
2629 vtx.srf_mode_all = 1;
2630 vtx.offset = 16;
2631 vtx.dst_gpr = ctx->tess_output_info;
2632 vtx.dst_sel_x = 0;
2633 vtx.dst_sel_y = 1;
2634 vtx.dst_sel_z = 2;
2635 vtx.dst_sel_w = 3;
2636 vtx.src_gpr = temp_val;
2637 vtx.src_sel_x = 0;
2638
2639 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2640 if (r)
2641 return r;
2642 }
2643 return 0;
2644 }
2645
/* VS running as LS: store every VS output to LDS where the TCS will read it.
 * The base address is vertex_dw_stride * relative vertex id; each output
 * vec4 lands at its r600_get_lds_unique_index() * 16 bytes and is written
 * with two LDS_WRITE_REL ops (two dwords per op). Returns 0 or error. */
static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
{
	int j, r;
	int temp_reg;
	unsigned i;

	/* fetch tcs input values into input_vals */
	ctx->tess_input_info = r600_get_temp(ctx);
	ctx->tess_output_info = 0;
	r = r600_fetch_tess_io_info(ctx);
	if (r)
		return r;

	temp_reg = r600_get_temp(ctx);
	/* dst reg contains LDS address stride * idx */
	/* MUL vertexID, vertex_dw_stride */
	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
			   temp_reg, 0,
			   ctx->tess_input_info, 1,
			   0, 1); /* rel id in r0.y? */
	if (r)
		return r;

	for (i = 0; i < ctx->shader->noutput; i++) {
		struct r600_bytecode_alu alu;
		int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid);

		/* temp.y = base address + per-output byte offset (skip the ADD
		 * when the offset is 0 and use temp.x directly below) */
		if (param) {
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 1,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;
		}

		/* temp.z = address of the second dword pair (+8 bytes) */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 2,
				   temp_reg, param ? 1 : 0,
				   V_SQ_ALU_SRC_LITERAL, 8);
		if (r)
			return r;


		/* write the vec4 as two LDS_WRITE_REL ops: (x,y) then (z,w) */
		for (j = 0; j < 2; j++) {
			/* address channel: temp.z for the 2nd pair, else temp.y/temp.x */
			int chan = (j == 1) ? 2 : (param ? 1 : 0);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = chan;
			alu.src[1].sel = ctx->shader->output[i].gpr;
			alu.src[1].chan = j * 2;
			alu.src[2].sel = ctx->shader->output[i].gpr;
			alu.src[2].chan = (j * 2) + 1;
			alu.last = 1;
			alu.dst.chan = 0;
			alu.lds_idx = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2711
/* Store the current TGSI instruction's OUTPUT destination to LDS (TCS).
 * Computes the per-channel byte addresses from the LDS layout info, then
 * emits paired LDS_WRITE_REL ops (two adjacent channels at once) where the
 * write mask allows, falling back to single LDS_WRITE otherwise.
 * No-op for non-OUTPUT destinations. Returns 0 or a bytecode error. */
static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	const struct tgsi_full_dst_register *dst = &inst->Dst[0];
	int i, r, lasti;
	int temp_reg = r600_get_temp(ctx);
	struct r600_bytecode_alu alu;
	unsigned write_mask = dst->Register.WriteMask;

	if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
		return 0;

	/* patch-constant outputs (no Dimension) use the per-patch LDS area */
	r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
	if (r)
		return r;

	/* the base address is now in temp.x */
	r = r600_get_byte_address(ctx, temp_reg,
				  &inst->Dst[0], NULL, ctx->tess_output_info, 1);
	if (r)
		return r;

	/* LDS write */
	lasti = tgsi_last_instruction(write_mask);
	/* temp.i = temp.x + 4*i for channels 1..lasti (channel 0 uses temp.x) */
	for (i = 1; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		/* xy or zw fully written: store both dwords with one WRITE_REL */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;

			alu.src[1].sel = dst->Register.Index;
			alu.src[1].sel += ctx->file_offset[dst->Register.File];
			alu.src[1].chan = i;

			alu.src[2].sel = dst->Register.Index;
			alu.src[2].sel += ctx->file_offset[dst->Register.File];
			alu.src[2].chan = i + 1;
			alu.lds_idx = 1;
			alu.dst.chan = 0;
			alu.last = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1;	/* second channel of the pair is done too */
			continue;
		}
		/* single-dword store */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;

		alu.src[1].sel = dst->Register.Index;
		alu.src[1].sel += ctx->file_offset[dst->Register.File];
		alu.src[1].chan = i;

		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.last = 1;
		alu.is_lds_idx_op = true;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2795
2796 static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
2797 int output_idx)
2798 {
2799 int param;
2800 unsigned temp_reg = r600_get_temp(ctx);
2801 unsigned name = ctx->shader->output[output_idx].name;
2802 int dreg = ctx->shader->output[output_idx].gpr;
2803 int r;
2804
2805 param = r600_get_lds_unique_index(name, 0);
2806 r = get_lds_offset0(ctx, 1, temp_reg, true);
2807 if (r)
2808 return r;
2809
2810 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2811 temp_reg, 0,
2812 temp_reg, 0,
2813 V_SQ_ALU_SRC_LITERAL, param * 16);
2814 if (r)
2815 return r;
2816
2817 do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
2818 return 0;
2819 }
2820
/* Emit the tessellation factor writes at the end of the TCS: read the
 * TESSINNER/TESSOUTER outputs back from LDS, compute the tess-factor
 * buffer address from RelPatchID and tf_base, and write the factors with
 * GDS TF_WRITE ops. Everything runs under a predicate so only one
 * invocation per patch does the write. Returns 0 or a negative error. */
static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
{
	int stride, outer_comps, inner_comps;
	int tessinner_idx = -1, tessouter_idx = -1;
	int i, r;
	unsigned j;
	int temp_reg = r600_get_temp(ctx);
	int treg[3] = {-1, -1, -1};
	struct r600_bytecode_alu alu;
	struct r600_bytecode_cf *cf_jump, *cf_pop;

	/* only execute factor emission for invocation 0 */
	/* PRED_SETE_INT __, R0.z, 0  (src[1].value stays 0 from the memset) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_PRED_SETE_INT;
	alu.src[0].chan = 2;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.execute_mask = 1;
	alu.update_pred = 1;
	alu.last = 1;
	r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	cf_jump = ctx->bc->cf_last;

	/* one temp GPR per pair of factors to be written */
	treg[0] = r600_get_temp(ctx);
	switch (ctx->shader->tcs_prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 8; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 16; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		treg[1] = r600_get_temp(ctx);
		break;
	case PIPE_PRIM_QUADS:
		stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		treg[1] = r600_get_temp(ctx);
		treg[2] = r600_get_temp(ctx);
		break;
	default:
		assert(0);
		return -1;
	}

	/* R0 is InvocationID, RelPatchID, PatchID, tf_base */
	/* TF_WRITE takes index in R.x, value in R.y */
	for (j = 0; j < ctx->shader->noutput; j++) {
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
			tessinner_idx = j;
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
			tessouter_idx = j;
	}

	if (tessouter_idx == -1)
		return -1;

	if (tessinner_idx == -1 && inner_comps)
		return -1;

	if (tessouter_idx != -1) {
		r = r600_tess_factor_read(ctx, tessouter_idx);
		if (r)
			return r;
	}

	if (tessinner_idx != -1) {
		r = r600_tess_factor_read(ctx, tessinner_idx);
		if (r)
			return r;
	}

	/* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
	/* r.x = relpatchid(r0.y) * tf_stride */

	/* multiply incoming r0.y * stride - t.x = r0.y * stride */
	/* add incoming r0.w to it: t.x = t.x + r0.w */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   0, 1,
			   V_SQ_ALU_SRC_LITERAL, stride,
			   0, 3);
	if (r)
		return r;

	/* build (index, value) pairs: treg[i/2].(x,y) and .(z,w) */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
		int out_comp = i >= outer_comps ? i - outer_comps : i;

		/* isoline outer factors are stored swapped */
		if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
			if (out_comp == 1)
				out_comp = 0;
			else if (out_comp == 0)
				out_comp = 1;
		}

		/* index = base address + 4*i */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   treg[i / 2], (2 * (i % 2)),
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
		/* value = the factor component read back from LDS */
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg[i / 2], 1 + (2 * (i%2)),
				   ctx->shader->output[out_idx].gpr, out_comp,
				   0, 0);
		if (r)
			return r;
	}
	/* one GDS TF_WRITE per factor component */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		struct r600_bytecode_gds gds;

		memset(&gds, 0, sizeof(struct r600_bytecode_gds));
		gds.src_gpr = treg[i / 2];
		gds.src_sel_x = 2 * (i % 2);
		gds.src_sel_y = 1 + (2 * (i % 2));
		gds.src_sel_z = 4;
		gds.dst_sel_x = 7;
		gds.dst_sel_y = 7;
		gds.dst_sel_z = 7;
		gds.dst_sel_w = 7;
		gds.op = FETCH_OP_TF_WRITE;
		r = r600_bytecode_add_gds(ctx->bc, &gds);
		if (r)
			return r;
	}

	// Patch up jump label
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
	cf_pop = ctx->bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	return 0;
}
2964
/*
 * We have to work out the thread ID for load and atomic
 * operations, which store the returned value to an index
 * in an intermediate buffer.
 * The index is calculated by taking the thread id,
 * calculated from the MBCNT instructions.
 * Then the shader engine ID is multiplied by 256,
 * and the wave id is added.
 * Then the result is multiplied by 64 and thread id is
 * added.
 */
static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	/* temp.x = popcount of exec mask bits below this lane (low 32) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = high half of the lane count */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MBCNT_32HI_INT;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 1;
	alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[0].value = 0xffffffff;
	alu.dst.write = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.z = SE_ID * 256 + HW_WAVE_ID */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD_UINT24;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 2;
	alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 256;
	alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
	alu.dst.write = 1;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* thread_id_gpr.y = temp.z * 64 + temp.x (lane id within the wave) */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   ctx->thread_id_gpr, 1,
			   ctx->temp_reg, 2,
			   V_SQ_ALU_SRC_LITERAL, 0x40,
			   ctx->temp_reg, 0);
	if (r)
		return r;
	return 0;
}
3027
3028 static int r600_shader_from_tgsi(struct r600_context *rctx,
3029 struct r600_pipe_shader *pipeshader,
3030 union r600_shader_key key)
3031 {
3032 struct r600_screen *rscreen = rctx->screen;
3033 struct r600_shader *shader = &pipeshader->shader;
3034 struct tgsi_token *tokens = pipeshader->selector->tokens;
3035 struct pipe_stream_output_info so = pipeshader->selector->so;
3036 struct tgsi_full_immediate *immediate;
3037 struct r600_shader_ctx ctx;
3038 struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
3039 unsigned output_done, noutput;
3040 unsigned opcode;
3041 int j, k, r = 0;
3042 unsigned i;
3043 int next_param_base = 0, next_clip_base;
3044 int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
3045 bool indirect_gprs;
3046 bool ring_outputs = false;
3047 bool lds_outputs = false;
3048 bool lds_inputs = false;
3049 bool pos_emitted = false;
3050
3051 ctx.bc = &shader->bc;
3052 ctx.shader = shader;
3053 ctx.native_integers = true;
3054
3055 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
3056 rscreen->has_compressed_msaa_texturing);
3057 ctx.tokens = tokens;
3058 tgsi_scan_shader(tokens, &ctx.info);
3059 shader->indirect_files = ctx.info.indirect_files;
3060
3061 shader->uses_doubles = ctx.info.uses_doubles;
3062 shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
3063 shader->nsys_inputs = 0;
3064
3065 shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0;
3066 indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
3067 tgsi_parse_init(&ctx.parse, tokens);
3068 ctx.type = ctx.info.processor;
3069 shader->processor_type = ctx.type;
3070 ctx.bc->type = shader->processor_type;
3071
3072 switch (ctx.type) {
3073 case PIPE_SHADER_VERTEX:
3074 shader->vs_as_gs_a = key.vs.as_gs_a;
3075 shader->vs_as_es = key.vs.as_es;
3076 shader->vs_as_ls = key.vs.as_ls;
3077 shader->atomic_base = key.vs.first_atomic_counter;
3078 if (shader->vs_as_es)
3079 ring_outputs = true;
3080 if (shader->vs_as_ls)
3081 lds_outputs = true;
3082 break;
3083 case PIPE_SHADER_GEOMETRY:
3084 ring_outputs = true;
3085 shader->atomic_base = key.gs.first_atomic_counter;
3086 shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;
3087 break;
3088 case PIPE_SHADER_TESS_CTRL:
3089 shader->tcs_prim_mode = key.tcs.prim_mode;
3090 shader->atomic_base = key.tcs.first_atomic_counter;
3091 lds_outputs = true;
3092 lds_inputs = true;
3093 break;
3094 case PIPE_SHADER_TESS_EVAL:
3095 shader->tes_as_es = key.tes.as_es;
3096 shader->atomic_base = key.tes.first_atomic_counter;
3097 lds_inputs = true;
3098 if (shader->tes_as_es)
3099 ring_outputs = true;
3100 break;
3101 case PIPE_SHADER_FRAGMENT:
3102 shader->two_side = key.ps.color_two_side;
3103 shader->atomic_base = key.ps.first_atomic_counter;
3104 shader->rat_base = key.ps.nr_cbufs;
3105 shader->image_size_const_offset = key.ps.image_size_const_offset;
3106 break;
3107 default:
3108 break;
3109 }
3110
3111 if (shader->vs_as_es || shader->tes_as_es) {
3112 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
3113 } else {
3114 ctx.gs_for_vs = NULL;
3115 }
3116
3117 ctx.next_ring_offset = 0;
3118 ctx.gs_out_ring_offset = 0;
3119 ctx.gs_next_vertex = 0;
3120 ctx.gs_stream_output_info = &so;
3121
3122 ctx.face_gpr = -1;
3123 ctx.fixed_pt_position_gpr = -1;
3124 ctx.fragcoord_input = -1;
3125 ctx.colors_used = 0;
3126 ctx.clip_vertex_write = 0;
3127
3128 shader->nr_ps_color_exports = 0;
3129 shader->nr_ps_max_color_exports = 0;
3130
3131
3132 /* register allocations */
3133 /* Values [0,127] correspond to GPR[0..127].
3134 * Values [128,159] correspond to constant buffer bank 0
3135 * Values [160,191] correspond to constant buffer bank 1
3136 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
3137 * Values [256,287] correspond to constant buffer bank 2 (EG)
3138 * Values [288,319] correspond to constant buffer bank 3 (EG)
3139 * Other special values are shown in the list below.
3140 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
3141 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
3142 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
3143 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
3144 * 248 SQ_ALU_SRC_0: special constant 0.0.
3145 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
3146 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
3147 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
3148 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
3149 * 253 SQ_ALU_SRC_LITERAL: literal constant.
3150 * 254 SQ_ALU_SRC_PV: previous vector result.
3151 * 255 SQ_ALU_SRC_PS: previous scalar result.
3152 */
3153 for (i = 0; i < TGSI_FILE_COUNT; i++) {
3154 ctx.file_offset[i] = 0;
3155 }
3156
3157 if (ctx.type == PIPE_SHADER_VERTEX) {
3158
3159 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3160 if (ctx.info.num_inputs)
3161 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3162 }
3163 if (ctx.type == PIPE_SHADER_FRAGMENT) {
3164 if (ctx.bc->chip_class >= EVERGREEN)
3165 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3166 else
3167 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3168 }
3169 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3170 /* FIXME 1 would be enough in some cases (3 or less input vertices) */
3171 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3172 }
3173 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3174 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3175 if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3176 bool add_tesscoord = false, add_tess_inout = false;
3177 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3178 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3179 /* if we have tesscoord save one reg */
3180 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3181 add_tesscoord = true;
3182 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3183 ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3184 add_tess_inout = true;
3185 }
3186 if (add_tesscoord || add_tess_inout)
3187 ctx.file_offset[TGSI_FILE_INPUT]++;
3188 if (add_tess_inout)
3189 ctx.file_offset[TGSI_FILE_INPUT]+=2;
3190 }
3191
3192 ctx.file_offset[TGSI_FILE_OUTPUT] =
3193 ctx.file_offset[TGSI_FILE_INPUT] +
3194 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3195 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3196 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
3197
3198 /* Outside the GPR range. This will be translated to one of the
3199 * kcache banks later. */
3200 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3201
3202 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3203 ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3204 ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
3205 ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
3206 ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
3207
3208 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3209 ctx.tess_input_info = ctx.bc->ar_reg + 3;
3210 ctx.tess_output_info = ctx.bc->ar_reg + 4;
3211 ctx.temp_reg = ctx.bc->ar_reg + 5;
3212 } else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3213 ctx.tess_input_info = 0;
3214 ctx.tess_output_info = ctx.bc->ar_reg + 3;
3215 ctx.temp_reg = ctx.bc->ar_reg + 4;
3216 } else if (ctx.type == PIPE_SHADER_GEOMETRY) {
3217 ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
3218 ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
3219 ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
3220 ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
3221 ctx.temp_reg = ctx.bc->ar_reg + 7;
3222 if (ctx.shader->gs_tri_strip_adj_fix) {
3223 ctx.gs_rotated_input[0] = ctx.bc->ar_reg + 7;
3224 ctx.gs_rotated_input[1] = ctx.bc->ar_reg + 8;
3225 ctx.temp_reg += 2;
3226 } else {
3227 ctx.gs_rotated_input[0] = 0;
3228 ctx.gs_rotated_input[1] = 1;
3229 }
3230 } else {
3231 ctx.temp_reg = ctx.bc->ar_reg + 3;
3232 }
3233
3234 if (shader->uses_images && ctx.type == PIPE_SHADER_FRAGMENT) {
3235 ctx.thread_id_gpr = ctx.temp_reg;
3236 ctx.temp_reg++;
3237 } else
3238 ctx.thread_id_gpr = 0;
3239
3240 shader->max_arrays = 0;
3241 shader->num_arrays = 0;
3242 if (indirect_gprs) {
3243
3244 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3245 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3246 ctx.file_offset[TGSI_FILE_OUTPUT] -
3247 ctx.file_offset[TGSI_FILE_INPUT],
3248 0x0F);
3249 }
3250 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3251 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3252 ctx.file_offset[TGSI_FILE_TEMPORARY] -
3253 ctx.file_offset[TGSI_FILE_OUTPUT],
3254 0x0F);
3255 }
3256 }
3257
3258 ctx.nliterals = 0;
3259 ctx.literals = NULL;
3260
3261 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
3262 ctx.info.colors_written == 1;
3263 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3264 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3265
3266 if (ctx.type == PIPE_SHADER_VERTEX ||
3267 ctx.type == PIPE_SHADER_GEOMETRY ||
3268 ctx.type == PIPE_SHADER_TESS_EVAL) {
3269 shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +
3270 ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;
3271 shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;
3272 shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];
3273 }
3274
3275 if (shader->vs_as_gs_a)
3276 vs_add_primid_output(&ctx, key.vs.prim_id_out);
3277
3278 if (ctx.type == PIPE_SHADER_TESS_EVAL)
3279 r600_fetch_tess_io_info(&ctx);
3280
3281 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3282 tgsi_parse_token(&ctx.parse);
3283 switch (ctx.parse.FullToken.Token.Type) {
3284 case TGSI_TOKEN_TYPE_IMMEDIATE:
3285 immediate = &ctx.parse.FullToken.FullImmediate;
3286 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3287 if(ctx.literals == NULL) {
3288 r = -ENOMEM;
3289 goto out_err;
3290 }
3291 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3292 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3293 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3294 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3295 ctx.nliterals++;
3296 break;
3297 case TGSI_TOKEN_TYPE_DECLARATION:
3298 r = tgsi_declaration(&ctx);
3299 if (r)
3300 goto out_err;
3301 break;
3302 case TGSI_TOKEN_TYPE_INSTRUCTION:
3303 case TGSI_TOKEN_TYPE_PROPERTY:
3304 break;
3305 default:
3306 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3307 r = -EINVAL;
3308 goto out_err;
3309 }
3310 }
3311
3312 shader->ring_item_sizes[0] = ctx.next_ring_offset;
3313 shader->ring_item_sizes[1] = 0;
3314 shader->ring_item_sizes[2] = 0;
3315 shader->ring_item_sizes[3] = 0;
3316
3317 /* Process two side if needed */
3318 if (shader->two_side && ctx.colors_used) {
3319 int i, count = ctx.shader->ninput;
3320 unsigned next_lds_loc = ctx.shader->nlds;
3321
3322 /* additional inputs will be allocated right after the existing inputs,
3323 * we won't need them after the color selection, so we don't need to
3324 * reserve these gprs for the rest of the shader code and to adjust
3325 * output offsets etc. */
3326 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3327 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3328
3329 /* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
3330 if (ctx.face_gpr == -1) {
3331 i = ctx.shader->ninput++;
3332 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3333 ctx.shader->input[i].spi_sid = 0;
3334 ctx.shader->input[i].gpr = gpr++;
3335 ctx.face_gpr = ctx.shader->input[i].gpr;
3336 }
3337
3338 for (i = 0; i < count; i++) {
3339 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3340 int ni = ctx.shader->ninput++;
3341 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3342 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3343 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3344 ctx.shader->input[ni].gpr = gpr++;
3345 // TGSI to LLVM needs to know the lds position of inputs.
3346 // Non LLVM path computes it later (in process_twoside_color)
3347 ctx.shader->input[ni].lds_pos = next_lds_loc++;
3348 ctx.shader->input[i].back_color_input = ni;
3349 if (ctx.bc->chip_class >= EVERGREEN) {
3350 if ((r = evergreen_interp_input(&ctx, ni)))
3351 return r;
3352 }
3353 }
3354 }
3355 }
3356
3357 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
3358 shader->nr_ps_max_color_exports = 8;
3359
3360 if (ctx.fragcoord_input >= 0) {
3361 if (ctx.bc->chip_class == CAYMAN) {
3362 for (j = 0 ; j < 4; j++) {
3363 struct r600_bytecode_alu alu;
3364 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3365 alu.op = ALU_OP1_RECIP_IEEE;
3366 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3367 alu.src[0].chan = 3;
3368
3369 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3370 alu.dst.chan = j;
3371 alu.dst.write = (j == 3);
3372 alu.last = 1;
3373 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3374 return r;
3375 }
3376 } else {
3377 struct r600_bytecode_alu alu;
3378 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3379 alu.op = ALU_OP1_RECIP_IEEE;
3380 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3381 alu.src[0].chan = 3;
3382
3383 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3384 alu.dst.chan = 3;
3385 alu.dst.write = 1;
3386 alu.last = 1;
3387 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3388 return r;
3389 }
3390 }
3391
3392 if (ctx.thread_id_gpr) {
3393 load_thread_id_gpr(&ctx);
3394 }
3395
3396 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3397 struct r600_bytecode_alu alu;
3398 int r;
3399
3400 /* GS thread with no output workaround - emit a cut at start of GS */
3401 if (ctx.bc->chip_class == R600)
3402 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3403
3404 for (j = 0; j < 4; j++) {
3405 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3406 alu.op = ALU_OP1_MOV;
3407 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3408 alu.src[0].value = 0;
3409 alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3410 alu.dst.write = 1;
3411 alu.last = 1;
3412 r = r600_bytecode_add_alu(ctx.bc, &alu);
3413 if (r)
3414 return r;
3415 }
3416
3417 if (ctx.shader->gs_tri_strip_adj_fix) {
3418 r = single_alu_op2(&ctx, ALU_OP2_AND_INT,
3419 ctx.gs_rotated_input[0], 2,
3420 0, 2,
3421 V_SQ_ALU_SRC_LITERAL, 1);
3422 if (r)
3423 return r;
3424
3425 for (i = 0; i < 6; i++) {
3426 int rotated = (i + 4) % 6;
3427 int offset_reg = i / 3;
3428 int offset_chan = i % 3;
3429 int rotated_offset_reg = rotated / 3;
3430 int rotated_offset_chan = rotated % 3;
3431
3432 if (offset_reg == 0 && offset_chan == 2)
3433 offset_chan = 3;
3434 if (rotated_offset_reg == 0 && rotated_offset_chan == 2)
3435 rotated_offset_chan = 3;
3436
3437 r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT,
3438 ctx.gs_rotated_input[offset_reg], offset_chan,
3439 ctx.gs_rotated_input[0], 2,
3440 offset_reg, offset_chan,
3441 rotated_offset_reg, rotated_offset_chan);
3442 if (r)
3443 return r;
3444 }
3445 }
3446 }
3447
3448 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3449 r600_fetch_tess_io_info(&ctx);
3450
3451 if (shader->two_side && ctx.colors_used) {
3452 if ((r = process_twoside_color_inputs(&ctx)))
3453 return r;
3454 }
3455
3456 tgsi_parse_init(&ctx.parse, tokens);
3457 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3458 tgsi_parse_token(&ctx.parse);
3459 switch (ctx.parse.FullToken.Token.Type) {
3460 case TGSI_TOKEN_TYPE_INSTRUCTION:
3461 r = tgsi_is_supported(&ctx);
3462 if (r)
3463 goto out_err;
3464 ctx.max_driver_temp_used = 0;
3465 /* reserve first tmp for everyone */
3466 r600_get_temp(&ctx);
3467
3468 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3469 if ((r = tgsi_split_constant(&ctx)))
3470 goto out_err;
3471 if ((r = tgsi_split_literal_constant(&ctx)))
3472 goto out_err;
3473 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3474 if ((r = tgsi_split_gs_inputs(&ctx)))
3475 goto out_err;
3476 } else if (lds_inputs) {
3477 if ((r = tgsi_split_lds_inputs(&ctx)))
3478 goto out_err;
3479 }
3480 if (ctx.bc->chip_class == CAYMAN)
3481 ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3482 else if (ctx.bc->chip_class >= EVERGREEN)
3483 ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3484 else
3485 ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3486 r = ctx.inst_info->process(&ctx);
3487 if (r)
3488 goto out_err;
3489
3490 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3491 r = r600_store_tcs_output(&ctx);
3492 if (r)
3493 goto out_err;
3494 }
3495 break;
3496 default:
3497 break;
3498 }
3499 }
3500
3501 /* Reset the temporary register counter. */
3502 ctx.max_driver_temp_used = 0;
3503
3504 noutput = shader->noutput;
3505
3506 if (!ring_outputs && ctx.clip_vertex_write) {
3507 unsigned clipdist_temp[2];
3508
3509 clipdist_temp[0] = r600_get_temp(&ctx);
3510 clipdist_temp[1] = r600_get_temp(&ctx);
3511
3512 /* need to convert a clipvertex write into clipdistance writes and not export
3513 the clip vertex anymore */
3514
3515 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
3516 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3517 shader->output[noutput].gpr = clipdist_temp[0];
3518 noutput++;
3519 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3520 shader->output[noutput].gpr = clipdist_temp[1];
3521 noutput++;
3522
3523 /* reset spi_sid for clipvertex output to avoid confusing spi */
3524 shader->output[ctx.cv_output].spi_sid = 0;
3525
3526 shader->clip_dist_write = 0xFF;
3527 shader->cc_dist_mask = 0xFF;
3528
3529 for (i = 0; i < 8; i++) {
3530 int oreg = i >> 2;
3531 int ochan = i & 3;
3532
3533 for (j = 0; j < 4; j++) {
3534 struct r600_bytecode_alu alu;
3535 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3536 alu.op = ALU_OP2_DOT4;
3537 alu.src[0].sel = shader->output[ctx.cv_output].gpr;
3538 alu.src[0].chan = j;
3539
3540 alu.src[1].sel = 512 + i;
3541 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
3542 alu.src[1].chan = j;
3543
3544 alu.dst.sel = clipdist_temp[oreg];
3545 alu.dst.chan = j;
3546 alu.dst.write = (j == ochan);
3547 if (j == 3)
3548 alu.last = 1;
3549 r = r600_bytecode_add_alu(ctx.bc, &alu);
3550 if (r)
3551 return r;
3552 }
3553 }
3554 }
3555
3556 /* Add stream outputs. */
3557 if (so.num_outputs) {
3558 bool emit = false;
3559 if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
3560 emit = true;
3561 if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
3562 emit = true;
3563 if (emit)
3564 emit_streamout(&ctx, &so, -1, NULL);
3565 }
3566 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
3567 convert_edgeflag_to_int(&ctx);
3568
3569 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3570 r600_emit_tess_factor(&ctx);
3571
3572 if (lds_outputs) {
3573 if (ctx.type == PIPE_SHADER_VERTEX) {
3574 if (ctx.shader->noutput)
3575 emit_lds_vs_writes(&ctx);
3576 }
3577 } else if (ring_outputs) {
3578 if (shader->vs_as_es || shader->tes_as_es) {
3579 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
3580 ctx.gs_export_gpr_tregs[1] = -1;
3581 ctx.gs_export_gpr_tregs[2] = -1;
3582 ctx.gs_export_gpr_tregs[3] = -1;
3583
3584 emit_gs_ring_writes(&ctx, &so, -1, FALSE);
3585 }
3586 } else {
3587 /* Export output */
3588 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
3589
3590 for (i = 0, j = 0; i < noutput; i++, j++) {
3591 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3592 output[j].gpr = shader->output[i].gpr;
3593 output[j].elem_size = 3;
3594 output[j].swizzle_x = 0;
3595 output[j].swizzle_y = 1;
3596 output[j].swizzle_z = 2;
3597 output[j].swizzle_w = 3;
3598 output[j].burst_count = 1;
3599 output[j].type = 0xffffffff;
3600 output[j].op = CF_OP_EXPORT;
3601 switch (ctx.type) {
3602 case PIPE_SHADER_VERTEX:
3603 case PIPE_SHADER_TESS_EVAL:
3604 switch (shader->output[i].name) {
3605 case TGSI_SEMANTIC_POSITION:
3606 output[j].array_base = 60;
3607 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3608 pos_emitted = true;
3609 break;
3610
3611 case TGSI_SEMANTIC_PSIZE:
3612 output[j].array_base = 61;
3613 output[j].swizzle_y = 7;
3614 output[j].swizzle_z = 7;
3615 output[j].swizzle_w = 7;
3616 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3617 pos_emitted = true;
3618 break;
3619 case TGSI_SEMANTIC_EDGEFLAG:
3620 output[j].array_base = 61;
3621 output[j].swizzle_x = 7;
3622 output[j].swizzle_y = 0;
3623 output[j].swizzle_z = 7;
3624 output[j].swizzle_w = 7;
3625 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3626 pos_emitted = true;
3627 break;
3628 case TGSI_SEMANTIC_LAYER:
3629 /* spi_sid is 0 for outputs that are
3630 * not consumed by PS */
3631 if (shader->output[i].spi_sid) {
3632 output[j].array_base = next_param_base++;
3633 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3634 j++;
3635 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3636 }
3637 output[j].array_base = 61;
3638 output[j].swizzle_x = 7;
3639 output[j].swizzle_y = 7;
3640 output[j].swizzle_z = 0;
3641 output[j].swizzle_w = 7;
3642 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3643 pos_emitted = true;
3644 break;
3645 case TGSI_SEMANTIC_VIEWPORT_INDEX:
3646 /* spi_sid is 0 for outputs that are
3647 * not consumed by PS */
3648 if (shader->output[i].spi_sid) {
3649 output[j].array_base = next_param_base++;
3650 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3651 j++;
3652 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3653 }
3654 output[j].array_base = 61;
3655 output[j].swizzle_x = 7;
3656 output[j].swizzle_y = 7;
3657 output[j].swizzle_z = 7;
3658 output[j].swizzle_w = 0;
3659 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3660 pos_emitted = true;
3661 break;
3662 case TGSI_SEMANTIC_CLIPVERTEX:
3663 j--;
3664 break;
3665 case TGSI_SEMANTIC_CLIPDIST:
3666 output[j].array_base = next_clip_base++;
3667 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3668 pos_emitted = true;
3669 /* spi_sid is 0 for clipdistance outputs that were generated
3670 * for clipvertex - we don't need to pass them to PS */
3671 if (shader->output[i].spi_sid) {
3672 j++;
3673 /* duplicate it as PARAM to pass to the pixel shader */
3674 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3675 output[j].array_base = next_param_base++;
3676 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3677 }
3678 break;
3679 case TGSI_SEMANTIC_FOG:
3680 output[j].swizzle_y = 4; /* 0 */
3681 output[j].swizzle_z = 4; /* 0 */
3682 output[j].swizzle_w = 5; /* 1 */
3683 break;
3684 case TGSI_SEMANTIC_PRIMID:
3685 output[j].swizzle_x = 2;
3686 output[j].swizzle_y = 4; /* 0 */
3687 output[j].swizzle_z = 4; /* 0 */
3688 output[j].swizzle_w = 4; /* 0 */
3689 break;
3690 }
3691
3692 break;
3693 case PIPE_SHADER_FRAGMENT:
3694 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
3695 /* never export more colors than the number of CBs */
3696 if (shader->output[i].sid >= max_color_exports) {
3697 /* skip export */
3698 j--;
3699 continue;
3700 }
3701 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3702 output[j].array_base = shader->output[i].sid;
3703 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3704 shader->nr_ps_color_exports++;
3705 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
3706 for (k = 1; k < max_color_exports; k++) {
3707 j++;
3708 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3709 output[j].gpr = shader->output[i].gpr;
3710 output[j].elem_size = 3;
3711 output[j].swizzle_x = 0;
3712 output[j].swizzle_y = 1;
3713 output[j].swizzle_z = 2;
3714 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3715 output[j].burst_count = 1;
3716 output[j].array_base = k;
3717 output[j].op = CF_OP_EXPORT;
3718 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3719 shader->nr_ps_color_exports++;
3720 }
3721 }
3722 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
3723 output[j].array_base = 61;
3724 output[j].swizzle_x = 2;
3725 output[j].swizzle_y = 7;
3726 output[j].swizzle_z = output[j].swizzle_w = 7;
3727 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3728 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
3729 output[j].array_base = 61;
3730 output[j].swizzle_x = 7;
3731 output[j].swizzle_y = 1;
3732 output[j].swizzle_z = output[j].swizzle_w = 7;
3733 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3734 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
3735 output[j].array_base = 61;
3736 output[j].swizzle_x = 7;
3737 output[j].swizzle_y = 7;
3738 output[j].swizzle_z = 0;
3739 output[j].swizzle_w = 7;
3740 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3741 } else {
3742 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
3743 r = -EINVAL;
3744 goto out_err;
3745 }
3746 break;
3747 case PIPE_SHADER_TESS_CTRL:
3748 break;
3749 default:
3750 R600_ERR("unsupported processor type %d\n", ctx.type);
3751 r = -EINVAL;
3752 goto out_err;
3753 }
3754
3755 if (output[j].type == 0xffffffff) {
3756 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3757 output[j].array_base = next_param_base++;
3758 }
3759 }
3760
3761 /* add fake position export */
3762 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
3763 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3764 output[j].gpr = 0;
3765 output[j].elem_size = 3;
3766 output[j].swizzle_x = 7;
3767 output[j].swizzle_y = 7;
3768 output[j].swizzle_z = 7;
3769 output[j].swizzle_w = 7;
3770 output[j].burst_count = 1;
3771 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3772 output[j].array_base = 60;
3773 output[j].op = CF_OP_EXPORT;
3774 j++;
3775 }
3776
3777 /* add fake param output for vertex shader if no param is exported */
3778 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
3779 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3780 output[j].gpr = 0;
3781 output[j].elem_size = 3;
3782 output[j].swizzle_x = 7;
3783 output[j].swizzle_y = 7;
3784 output[j].swizzle_z = 7;
3785 output[j].swizzle_w = 7;
3786 output[j].burst_count = 1;
3787 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3788 output[j].array_base = 0;
3789 output[j].op = CF_OP_EXPORT;
3790 j++;
3791 }
3792
3793 /* add fake pixel export */
3794 if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
3795 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3796 output[j].gpr = 0;
3797 output[j].elem_size = 3;
3798 output[j].swizzle_x = 7;
3799 output[j].swizzle_y = 7;
3800 output[j].swizzle_z = 7;
3801 output[j].swizzle_w = 7;
3802 output[j].burst_count = 1;
3803 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3804 output[j].array_base = 0;
3805 output[j].op = CF_OP_EXPORT;
3806 j++;
3807 shader->nr_ps_color_exports++;
3808 }
3809
3810 noutput = j;
3811
3812 /* set export done on last export of each type */
3813 for (k = noutput - 1, output_done = 0; k >= 0; k--) {
3814 if (!(output_done & (1 << output[k].type))) {
3815 output_done |= (1 << output[k].type);
3816 output[k].op = CF_OP_EXPORT_DONE;
3817 }
3818 }
3819 /* add output to bytecode */
3820 for (i = 0; i < noutput; i++) {
3821 r = r600_bytecode_add_output(ctx.bc, &output[i]);
3822 if (r)
3823 goto out_err;
3824 }
3825 }
3826
3827 /* add program end */
3828 if (ctx.bc->chip_class == CAYMAN)
3829 cm_bytecode_add_cf_end(ctx.bc);
3830 else {
3831 const struct cf_op_info *last = NULL;
3832
3833 if (ctx.bc->cf_last)
3834 last = r600_isa_cf(ctx.bc->cf_last->op);
3835
3836 /* alu clause instructions don't have EOP bit, so add NOP */
3837 if (!last || last->flags & CF_ALU)
3838 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
3839
3840 ctx.bc->cf_last->end_of_program = 1;
3841 }
3842
3843 /* check GPR limit - we have 124 = 128 - 4
3844 * (4 are reserved as alu clause temporary registers) */
3845 if (ctx.bc->ngpr > 124) {
3846 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
3847 r = -ENOMEM;
3848 goto out_err;
3849 }
3850
3851 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3852 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
3853 return r;
3854 }
3855
3856 free(ctx.literals);
3857 tgsi_parse_free(&ctx.parse);
3858 return 0;
3859 out_err:
3860 free(ctx.literals);
3861 tgsi_parse_free(&ctx.parse);
3862 return r;
3863 }
3864
3865 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
3866 {
3867 const unsigned tgsi_opcode =
3868 ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
3869 R600_ERR("%s tgsi opcode unsupported\n",
3870 tgsi_get_opcode_name(tgsi_opcode));
3871 return -EINVAL;
3872 }
3873
3874 static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)
3875 {
3876 return 0;
3877 }
3878
3879 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
3880 const struct r600_shader_src *shader_src,
3881 unsigned chan)
3882 {
3883 bc_src->sel = shader_src->sel;
3884 bc_src->chan = shader_src->swizzle[chan];
3885 bc_src->neg = shader_src->neg;
3886 bc_src->abs = shader_src->abs;
3887 bc_src->rel = shader_src->rel;
3888 bc_src->value = shader_src->value[bc_src->chan];
3889 bc_src->kc_bank = shader_src->kc_bank;
3890 bc_src->kc_rel = shader_src->kc_rel;
3891 }
3892
3893 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
3894 {
3895 bc_src->abs = 1;
3896 bc_src->neg = 0;
3897 }
3898
3899 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
3900 {
3901 bc_src->neg = !bc_src->neg;
3902 }
3903
3904 static void tgsi_dst(struct r600_shader_ctx *ctx,
3905 const struct tgsi_full_dst_register *tgsi_dst,
3906 unsigned swizzle,
3907 struct r600_bytecode_alu_dst *r600_dst)
3908 {
3909 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3910
3911 r600_dst->sel = tgsi_dst->Register.Index;
3912 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
3913 r600_dst->chan = swizzle;
3914 r600_dst->write = 1;
3915 if (inst->Instruction.Saturate) {
3916 r600_dst->clamp = 1;
3917 }
3918 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
3919 if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
3920 return;
3921 }
3922 }
3923 if (tgsi_dst->Register.Indirect)
3924 r600_dst->rel = V_SQ_REL_RELATIVE;
3925
3926 }
3927
/* Emit a two-source double-precision (64-bit) ALU operation.
 *
 * A double occupies a pair of 32-bit channels (xy holds one value, zw the
 * other), so the TGSI writemask is first widened to full channel pairs.
 *
 * singledest: the op yields a single 32-bit result per source pair; only
 *             the low channel of each pair is actually written (writes to
 *             channels 1 and 3 are suppressed below).
 * swap:       emit with the two TGSI source operands exchanged.
 *
 * Returns 0 on success, or the r600_bytecode_add_alu() error code.
 */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;

	if (singledest) {
		/* Widen the mask to the channel pair feeding the op. When the
		 * requested destination channel is the high half of a pair
		 * (y or w), compute into temp_reg first and copy out after;
		 * use_tmp - 1 is the temp channel holding the result. */
		switch (write_mask) {
		case 0x1:
			write_mask = 0x3;
			break;
		case 0x2:
			use_tmp = 1;
			write_mask = 0x3;
			break;
		case 0x4:
			write_mask = 0xc;
			break;
		case 0x8:
			write_mask = 0xc;
			use_tmp = 3;
			break;
		}
	}

	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			if (use_tmp) {
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
			}
			/* only the low channel of each pair carries the result */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			/* DABS keeps channel order (no fp64_switch); the abs
			 * modifier itself is applied further down. */
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				/* fp64_switch() selects the matching 32-bit half
				 * of the 64-bit source for this channel. */
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases */
		if (i == 1 || i == 3) {
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_DABS:
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		write_mask = inst->Dst[0].Register.WriteMask;

		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
4027
4028 static int tgsi_op2_64(struct r600_shader_ctx *ctx)
4029 {
4030 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4031 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4032 /* confirm writemasking */
4033 if ((write_mask & 0x3) != 0x3 &&
4034 (write_mask & 0xc) != 0xc) {
4035 fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
4036 return -1;
4037 }
4038 return tgsi_op2_64_params(ctx, false, false);
4039 }
4040
/* 64-bit two-source op with a single per-pair destination, operands in
 * TGSI order. */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, /*singledest=*/true, /*swap=*/false);
}
4045
/* 64-bit two-source op with a single per-pair destination, operands
 * swapped. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, /*singledest=*/true, /*swap=*/true);
}
4050
4051 static int tgsi_op3_64(struct r600_shader_ctx *ctx)
4052 {
4053 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4054 struct r600_bytecode_alu alu;
4055 int i, j, r;
4056 int lasti = 3;
4057 int tmp = r600_get_temp(ctx);
4058
4059 for (i = 0; i < lasti + 1; i++) {
4060
4061 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4062 alu.op = ctx->inst_info->op;
4063 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4064 r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
4065 }
4066
4067 if (inst->Dst[0].Register.WriteMask & (1 << i))
4068 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4069 else
4070 alu.dst.sel = tmp;
4071
4072 alu.dst.chan = i;
4073 alu.is_op3 = 1;
4074 if (i == lasti) {
4075 alu.last = 1;
4076 }
4077 r = r600_bytecode_add_alu(ctx->bc, &alu);
4078 if (r)
4079 return r;
4080 }
4081 return 0;
4082 }
4083
/* Emit a component-wise two-source 32-bit ALU operation.
 *
 * swap:       exchange the two TGSI source operands before emission.
 * trans_only: the op can only run on the trans (t) unit, so each component
 *             closes its own instruction group (alu.last on every one);
 *             with more than one written component the results are staged
 *             in temp_reg and copied to the destination afterwards.
 *
 * Returns 0 on success, or the r600_bytecode_add_alu() error code.
 */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));
	unsigned op = ctx->inst_info->op;

	/* honor MUL_ZERO_WINS: fall back from IEEE multiply to the legacy
	 * multiply when the shader requests it */
	if (op == ALU_OP2_MUL_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
		op = ALU_OP2_MUL;

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		/* trans-only ops terminate the group after every component */
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
4147
/* Plain two-operand ALU op: natural source order, no trans-slot restriction. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, /*swap=*/0, /*trans_only=*/0);
}
4152
/* Two-operand ALU op with the sources reversed. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, /*swap=*/1, /*trans_only=*/0);
}
4157
/* Two-operand ALU op restricted to the trans slot (one instruction per group). */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, /*swap=*/0, /*trans_only=*/1);
}
4162
/* Integer negate: per enabled channel emit  dst = <op>(0, src),
 * i.e. the instruction's opcode with a constant-zero first operand. */
static int tgsi_ineg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {

		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.src[0].sel = V_SQ_ALU_SRC_0;	/* constant 0 */

		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;

}
4193
/* Double negate: copy every enabled channel, flipping the sign on
 * channels 1 and 3 — the upper dword of each double pair, where the
 * IEEE-754 sign bit lives. */
static int tgsi_dneg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {

		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);

		/* only the high half of each pair carries the sign */
		if (i == 1 || i == 3)
			r600_bytecode_src_toggle_neg(&alu.src[0]);
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;

}
4224
/* DFRACEXP: split a double into significand and exponent.
 *
 * Pass 1 runs the op on all four slots into ctx->temp_reg, feeding each
 * slot the fp64-swizzled source.  Pass 2 replicates the significand
 * (temp channels 2/3) into every enabled channel of Dst[0].  Pass 3
 * moves temp channel 1 (presumably the exponent — per the original
 * in-line comment) into the first enabled channel of Dst[1].
 */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;

	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			/* fp64_switch maps the slot index to the double-pair channel */
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Replicate significand result across channels. */
	for (i = 0; i <= 3; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].chan = (i & 1) + 2;	/* temp channels 2/3 */
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			break;	/* only the first enabled channel of Dst[1] */
		}
	}
	return 0;
}
4287
4288
/* I2D/U2D on evergreen/cayman: (u)int32 -> double.
 *
 * Pass 1 converts each source int to a 32-bit float in ctx->temp_reg
 * (one temp channel per result double).  Pass 2 widens with
 * FLT32_TO_FLT64: the even channel of each destination pair reads the
 * converted float, the odd channel is fed a literal 0.
 */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
	       inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	for (i = 0; i <= (lasti+1)/2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;	/* INT_TO_FLT / UINT_TO_FLT */

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_FLT32_TO_FLT64;

		alu.src[0].chan = i/2;	/* one temp channel per pair */
		if (i%2 == 0)
			alu.src[0].sel = ctx->temp_reg;
		else {
			/* odd channel of the pair takes a zero literal */
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0x0;
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4335
/* D2I/D2U on evergreen/cayman: double -> (u)int32.
 *
 * Pass 1 narrows each double to a 32-bit float with FLT64_TO_FLT32
 * (fp64 source swizzle); only even temp channels commit the result.
 * Pass 2 runs the float->int conversion on temp channel i*2.
 *
 * NOTE(review): pass 2 always writes destination channel 0
 * (tgsi_dst(..., 0, ...)) even though the loop can run more than once
 * for a two-double writemask — the later iteration would overwrite the
 * first result.  Verify whether the channel index should be i.
 */
static int egcm_double_to_int(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
	       inst->Instruction.Opcode == TGSI_OPCODE_D2U);

	for (i = 0; i <= lasti; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_FLT64_TO_FLT32;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], fp64_switch(i));
		alu.dst.chan = i;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.write = i%2 == 0;	/* only the even channel holds the float */
		alu.last = i == lasti;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= (lasti+1)/2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;	/* FLT_TO_INT / FLT_TO_UINT */

		alu.src[0].chan = i*2;
		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4377
/* Emit a cayman-style unary double op (e.g. RECIP_64) in raw form.
 *
 * src0 is fed channel 1 of the operand pair and src1 channel 0; when
 * 'abs' is set the absolute value is taken on src1 (used for
 * DSQRT/DRSQ, see caller).  The result is committed to dst_reg.xy only.
 * On non-CAYMAN chips each slot closes its own instruction group; on
 * CAYMAN only the final slot does.
 */
static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
					unsigned op,
					int dst_reg,
					struct r600_shader_src *src,
					bool abs)
{
	struct r600_bytecode_alu alu;
	const int last_slot = 3;
	int r;

	/* these have to write the result to X/Y by the looks of it */
	for (int i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;

		r600_bytecode_src(&alu.src[0], src, 1);
		r600_bytecode_src(&alu.src[1], src, 0);

		if (abs)
			r600_bytecode_src_set_abs(&alu.src[1]);

		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = (i == 0 || i == 1);	/* result pair lives in .xy */

		if (bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4412
/* Cayman unary double instruction (DRSQ/DSQRT/DRCP-style).
 *
 * Delegates to cayman_emit_unary_double_raw — taking |src| for
 * DRSQ/DSQRT — which leaves the result pair in t1.xy, then moves it
 * into the destination pair selected by the writemask.  Only one
 * double (xy or zw) is supported per instruction.
 */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* should only be one src regs */
	assert(inst->Instruction.NumSrcRegs == 1);

	/* only support one double at a time */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	r = cayman_emit_unary_double_raw(
		ctx->bc, ctx->inst_info->op, t1,
		&ctx->src[0],
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
	if (r)
		return r;

	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		/* t1.x feeds the low channel of the pair, t1.y the high one */
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4453
/* Cayman replication of t-slot-only float ops (see CAYMAN notes at the
 * top of the file): the op is issued in 3 vector slots — 4 when .w is
 * written — each reading source channel 0; only slots whose channel is
 * in the writemask commit.  RSQ takes the absolute value of its source.
 */
static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);

			/* RSQ should take the absolute value of src */
			if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
				r600_bytecode_src_set_abs(&alu.src[j]);
			}
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4483
/* Cayman integer multiply (MULLO/MULHI-style t-slot ops).
 *
 * For each enabled destination channel k the op is issued in all four
 * vector slots reading source channel k, but only slot k commits into
 * the temp register; the temp is then copied to the destination.
 */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			alu.dst.write = (i == k);	/* only the matching slot commits */
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* copy the collected results out of the temp register */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4531
4532
/* Cayman double multiply (DMUL).
 *
 * The MUL_64 op is issued in all four slots; the source swizzle feeds
 * the selected pair (k*2 / k*2+1, channel 1 for slots 0-2 and channel 0
 * for slot 3) and the result is collected in the temp register, then
 * copied into the destination pair.  Only one double per instruction.
 */
static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* t1 would get overwritten below if we actually tried to
	 * multiply two pairs of doubles at a time. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* move the result pair to the destination */
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4582
4583 /*
4584 * Emit RECIP_64 + MUL_64 to implement division.
4585 */
/* DDIV on cayman: lowered as src0 * RECIP_64(src1).
 *
 * RECIP_64 leaves 1/src1 in t1.xy; MUL_64 then multiplies by src0
 * across the four slots, and the result pair is moved from t1.xy into
 * the destination pair selected by the writemask.  Only one double per
 * instruction (same constraint as DMUL lowering).
 */
static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_alu alu;
	int t1 = ctx->temp_reg;
	int k;

	/* Only support one double at a time. This is the same constraint as
	 * in DMUL lowering. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	/* t1.xy = 1 / src1 */
	r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
	if (r)
		return r;

	/* t1 = src0 * t1 (MUL_64 issued in all four slots) */
	for (int i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL_64;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));

		alu.src[1].sel = t1;
		alu.src[1].chan = (i == 3) ? 0 : 1;

		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* copy t1.xy into the destination pair */
	for (int i = 0; i < 2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
		alu.dst.write = 1;
		if (i == 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4639
4640 /*
4641 * r600 - trunc to -PI..PI range
4642 * r700 - normalize by dividing by 2PI
4643 * see fdo bug 27901
4644 */
/* Range-reduce the trig argument into what the hardware expects,
 * leaving the result in ctx->temp_reg.x:
 *   tmp = fract(src * 1/(2*PI) + 0.5)
 *   r600:  tmp = tmp * 2*PI - PI     (back to [-PI, PI])
 *   r700+: tmp = tmp * 1 - 0.5      (normalized input, [-0.5, 0.5])
 * See the comment above this function / fdo bug 27901.
 */
static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* tmp.x = src * 1/(2*PI) + 0.5 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
	alu.src[2].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* tmp.x = fract(tmp.x) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FRACT;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* tmp.x = tmp.x * <scale> + <bias>, chip-dependent (see header) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[2].chan = 0;

	if (ctx->bc->chip_class == R600) {
		alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
		alu.src[2].value = u_bitcast_f2u(-M_PI);
	} else {
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
		alu.src[2].neg = 1;
	}

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
4715
/* Cayman trig op: range-reduce the argument via tgsi_setup_trig, then
 * issue the (t-slot-only) trig op across 3 or 4 vector slots, all
 * reading temp.x; only slots whose channel is in the writemask commit.
 */
static int cayman_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
	int i, r;

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;


	for (i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;	/* reduced argument from tgsi_setup_trig */
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4746
/* Trig op (non-cayman): range-reduce via tgsi_setup_trig, run the op
 * once into temp.x, then replicate temp.x into every enabled
 * destination channel.
 */
static int tgsi_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	/* temp.x = op(temp.x) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* replicate result */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = ctx->temp_reg;	/* chan defaults to 0 (memset) */
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4789
/* KILL / KILL_IF: emit the per-channel kill compare from inst_info.
 * src0 is constant 0; src1 is constant -1 for unconditional KILL, or
 * the TGSI source for conditional KILL_IF.  The kill must terminate the
 * ALU clause, and the shader is flagged as using kill.
 */
static int tgsi_kill(struct r600_shader_ctx *ctx)
{
	const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
			/* unconditional kill: compare against constant -1 */
			alu.src[1].sel = V_SQ_ALU_SRC_1;
			alu.src[1].neg = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* kill must be last in ALU */
	ctx->bc->force_add_cf = 1;
	ctx->shader->uses_kill = TRUE;
	return 0;
}
4823
/* LIT: classic OpenGL lighting coefficients.
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0)
 *   dst.z = exp(MUL_LIT(log_clamped(max(src.y, 0)), src.w, src.x))
 *           (only computed when .z is in the writemask)
 *   dst.w = 1.0
 * On CAYMAN the t-slot-only LOG/EXP ops are replicated across three
 * vector slots with only the last one committing.
 */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		unsigned i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the log landed (differs between the two paths) */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
4973
/* RSQ: compute 1/sqrt(|src|) into temp.x, then replicate temp.x to all
 * enabled destination channels.  abs is applied to every source. */
static int tgsi_rsq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	alu.op = ALU_OP1_RECIPSQRT_IEEE;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
		r600_bytecode_src_set_abs(&alu.src[i]);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}
4997
/* Copy ctx->temp_reg channel 0 (src chan defaults to 0 via memset) into
 * every destination channel; only channels in the writemask commit. */
static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.src[0].sel = ctx->temp_reg;
		alu.op = ALU_OP1_MOV;
		alu.dst.chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5019
/* Run a trans-slot op once on the .x channel of the sources into
 * temp.x, then replicate temp.x to all enabled destination channels. */
static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}
5040
/* POW on cayman: POW(a,b) = EXP2(b * LOG2(a)), with the t-slot-only
 * LOG/EXP ops replicated across vector slots (3, or 4 when .w is
 * written for the final EXP). */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	/* temp = LOG2(a), replicated over three slots */
	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a))*/
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5090
/* POW (non-cayman): POW(a,b) = EXP2(b * LOG2(a)), computed in temp.x
 * and then replicated into the enabled destination channels. */
static int tgsi_pow(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	/* LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_LOG_IEEE;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* POW(a,b) = EXP2(b * LOG2(a))*/
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_EXP_IEEE;
	alu.src[0].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return tgsi_helper_tempx_replicate(ctx);
}
5129
5130 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
5131 {
5132 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5133 struct r600_bytecode_alu alu;
5134 int i, r, j;
5135 unsigned write_mask = inst->Dst[0].Register.WriteMask;
5136 int tmp0 = ctx->temp_reg;
5137 int tmp1 = r600_get_temp(ctx);
5138 int tmp2 = r600_get_temp(ctx);
5139 int tmp3 = r600_get_temp(ctx);
5140 /* Unsigned path:
5141 *
5142 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
5143 *
5144 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
5145 * 2. tmp0.z = lo (tmp0.x * src2)
5146 * 3. tmp0.w = -tmp0.z
5147 * 4. tmp0.y = hi (tmp0.x * src2)
5148 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
5149 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
5150 * 7. tmp1.x = tmp0.x - tmp0.w
5151 * 8. tmp1.y = tmp0.x + tmp0.w
5152 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
5153 * 10. tmp0.z = hi(tmp0.x * src1) = q
5154 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
5155 *
5156 * 12. tmp0.w = src1 - tmp0.y = r
5157 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
5158 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
5159 *
5160 * if DIV
5161 *
5162 * 15. tmp1.z = tmp0.z + 1 = q + 1
5163 * 16. tmp1.w = tmp0.z - 1 = q - 1
5164 *
5165 * else MOD
5166 *
5167 * 15. tmp1.z = tmp0.w - src2 = r - src2
5168 * 16. tmp1.w = tmp0.w + src2 = r + src2
5169 *
5170 * endif
5171 *
5172 * 17. tmp1.x = tmp1.x & tmp1.y
5173 *
5174 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
5175 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
5176 *
5177 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
5178 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
5179 *
5180 * Signed path:
5181 *
5182 * Same as unsigned, using abs values of the operands,
5183 * and fixing the sign of the result in the end.
5184 */
5185
5186 for (i = 0; i < 4; i++) {
5187 if (!(write_mask & (1<<i)))
5188 continue;
5189
5190 if (signed_op) {
5191
5192 /* tmp2.x = -src0 */
5193 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5194 alu.op = ALU_OP2_SUB_INT;
5195
5196 alu.dst.sel = tmp2;
5197 alu.dst.chan = 0;
5198 alu.dst.write = 1;
5199
5200 alu.src[0].sel = V_SQ_ALU_SRC_0;
5201
5202 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5203
5204 alu.last = 1;
5205 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5206 return r;
5207
5208 /* tmp2.y = -src1 */
5209 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5210 alu.op = ALU_OP2_SUB_INT;
5211
5212 alu.dst.sel = tmp2;
5213 alu.dst.chan = 1;
5214 alu.dst.write = 1;
5215
5216 alu.src[0].sel = V_SQ_ALU_SRC_0;
5217
5218 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5219
5220 alu.last = 1;
5221 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5222 return r;
5223
5224 /* tmp2.z sign bit is set if src0 and src2 signs are different */
5225 /* it will be a sign of the quotient */
5226 if (!mod) {
5227
5228 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5229 alu.op = ALU_OP2_XOR_INT;
5230
5231 alu.dst.sel = tmp2;
5232 alu.dst.chan = 2;
5233 alu.dst.write = 1;
5234
5235 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5236 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5237
5238 alu.last = 1;
5239 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5240 return r;
5241 }
5242
5243 /* tmp2.x = |src0| */
5244 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5245 alu.op = ALU_OP3_CNDGE_INT;
5246 alu.is_op3 = 1;
5247
5248 alu.dst.sel = tmp2;
5249 alu.dst.chan = 0;
5250 alu.dst.write = 1;
5251
5252 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5253 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5254 alu.src[2].sel = tmp2;
5255 alu.src[2].chan = 0;
5256
5257 alu.last = 1;
5258 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5259 return r;
5260
5261 /* tmp2.y = |src1| */
5262 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5263 alu.op = ALU_OP3_CNDGE_INT;
5264 alu.is_op3 = 1;
5265
5266 alu.dst.sel = tmp2;
5267 alu.dst.chan = 1;
5268 alu.dst.write = 1;
5269
5270 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5271 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5272 alu.src[2].sel = tmp2;
5273 alu.src[2].chan = 1;
5274
5275 alu.last = 1;
5276 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5277 return r;
5278
5279 }
5280
5281 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
5282 if (ctx->bc->chip_class == CAYMAN) {
5283 /* tmp3.x = u2f(src2) */
5284 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5285 alu.op = ALU_OP1_UINT_TO_FLT;
5286
5287 alu.dst.sel = tmp3;
5288 alu.dst.chan = 0;
5289 alu.dst.write = 1;
5290
5291 if (signed_op) {
5292 alu.src[0].sel = tmp2;
5293 alu.src[0].chan = 1;
5294 } else {
5295 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5296 }
5297
5298 alu.last = 1;
5299 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5300 return r;
5301
5302 /* tmp0.x = recip(tmp3.x) */
5303 for (j = 0 ; j < 3; j++) {
5304 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5305 alu.op = ALU_OP1_RECIP_IEEE;
5306
5307 alu.dst.sel = tmp0;
5308 alu.dst.chan = j;
5309 alu.dst.write = (j == 0);
5310
5311 alu.src[0].sel = tmp3;
5312 alu.src[0].chan = 0;
5313
5314 if (j == 2)
5315 alu.last = 1;
5316 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5317 return r;
5318 }
5319
5320 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5321 alu.op = ALU_OP2_MUL;
5322
5323 alu.src[0].sel = tmp0;
5324 alu.src[0].chan = 0;
5325
5326 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5327 alu.src[1].value = 0x4f800000;
5328
5329 alu.dst.sel = tmp3;
5330 alu.dst.write = 1;
5331 alu.last = 1;
5332 r = r600_bytecode_add_alu(ctx->bc, &alu);
5333 if (r)
5334 return r;
5335
5336 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5337 alu.op = ALU_OP1_FLT_TO_UINT;
5338
5339 alu.dst.sel = tmp0;
5340 alu.dst.chan = 0;
5341 alu.dst.write = 1;
5342
5343 alu.src[0].sel = tmp3;
5344 alu.src[0].chan = 0;
5345
5346 alu.last = 1;
5347 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5348 return r;
5349
5350 } else {
5351 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5352 alu.op = ALU_OP1_RECIP_UINT;
5353
5354 alu.dst.sel = tmp0;
5355 alu.dst.chan = 0;
5356 alu.dst.write = 1;
5357
5358 if (signed_op) {
5359 alu.src[0].sel = tmp2;
5360 alu.src[0].chan = 1;
5361 } else {
5362 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5363 }
5364
5365 alu.last = 1;
5366 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5367 return r;
5368 }
5369
5370 /* 2. tmp0.z = lo (tmp0.x * src2) */
5371 if (ctx->bc->chip_class == CAYMAN) {
5372 for (j = 0 ; j < 4; j++) {
5373 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5374 alu.op = ALU_OP2_MULLO_UINT;
5375
5376 alu.dst.sel = tmp0;
5377 alu.dst.chan = j;
5378 alu.dst.write = (j == 2);
5379
5380 alu.src[0].sel = tmp0;
5381 alu.src[0].chan = 0;
5382 if (signed_op) {
5383 alu.src[1].sel = tmp2;
5384 alu.src[1].chan = 1;
5385 } else {
5386 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5387 }
5388
5389 alu.last = (j == 3);
5390 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5391 return r;
5392 }
5393 } else {
5394 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5395 alu.op = ALU_OP2_MULLO_UINT;
5396
5397 alu.dst.sel = tmp0;
5398 alu.dst.chan = 2;
5399 alu.dst.write = 1;
5400
5401 alu.src[0].sel = tmp0;
5402 alu.src[0].chan = 0;
5403 if (signed_op) {
5404 alu.src[1].sel = tmp2;
5405 alu.src[1].chan = 1;
5406 } else {
5407 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5408 }
5409
5410 alu.last = 1;
5411 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5412 return r;
5413 }
5414
5415 /* 3. tmp0.w = -tmp0.z */
5416 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5417 alu.op = ALU_OP2_SUB_INT;
5418
5419 alu.dst.sel = tmp0;
5420 alu.dst.chan = 3;
5421 alu.dst.write = 1;
5422
5423 alu.src[0].sel = V_SQ_ALU_SRC_0;
5424 alu.src[1].sel = tmp0;
5425 alu.src[1].chan = 2;
5426
5427 alu.last = 1;
5428 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5429 return r;
5430
5431 /* 4. tmp0.y = hi (tmp0.x * src2) */
5432 if (ctx->bc->chip_class == CAYMAN) {
5433 for (j = 0 ; j < 4; j++) {
5434 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5435 alu.op = ALU_OP2_MULHI_UINT;
5436
5437 alu.dst.sel = tmp0;
5438 alu.dst.chan = j;
5439 alu.dst.write = (j == 1);
5440
5441 alu.src[0].sel = tmp0;
5442 alu.src[0].chan = 0;
5443
5444 if (signed_op) {
5445 alu.src[1].sel = tmp2;
5446 alu.src[1].chan = 1;
5447 } else {
5448 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5449 }
5450 alu.last = (j == 3);
5451 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5452 return r;
5453 }
5454 } else {
5455 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5456 alu.op = ALU_OP2_MULHI_UINT;
5457
5458 alu.dst.sel = tmp0;
5459 alu.dst.chan = 1;
5460 alu.dst.write = 1;
5461
5462 alu.src[0].sel = tmp0;
5463 alu.src[0].chan = 0;
5464
5465 if (signed_op) {
5466 alu.src[1].sel = tmp2;
5467 alu.src[1].chan = 1;
5468 } else {
5469 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5470 }
5471
5472 alu.last = 1;
5473 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5474 return r;
5475 }
5476
5477 /* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src)) */
5478 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5479 alu.op = ALU_OP3_CNDE_INT;
5480 alu.is_op3 = 1;
5481
5482 alu.dst.sel = tmp0;
5483 alu.dst.chan = 2;
5484 alu.dst.write = 1;
5485
5486 alu.src[0].sel = tmp0;
5487 alu.src[0].chan = 1;
5488 alu.src[1].sel = tmp0;
5489 alu.src[1].chan = 3;
5490 alu.src[2].sel = tmp0;
5491 alu.src[2].chan = 2;
5492
5493 alu.last = 1;
5494 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5495 return r;
5496
5497 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
5498 if (ctx->bc->chip_class == CAYMAN) {
5499 for (j = 0 ; j < 4; j++) {
5500 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5501 alu.op = ALU_OP2_MULHI_UINT;
5502
5503 alu.dst.sel = tmp0;
5504 alu.dst.chan = j;
5505 alu.dst.write = (j == 3);
5506
5507 alu.src[0].sel = tmp0;
5508 alu.src[0].chan = 2;
5509
5510 alu.src[1].sel = tmp0;
5511 alu.src[1].chan = 0;
5512
5513 alu.last = (j == 3);
5514 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5515 return r;
5516 }
5517 } else {
5518 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5519 alu.op = ALU_OP2_MULHI_UINT;
5520
5521 alu.dst.sel = tmp0;
5522 alu.dst.chan = 3;
5523 alu.dst.write = 1;
5524
5525 alu.src[0].sel = tmp0;
5526 alu.src[0].chan = 2;
5527
5528 alu.src[1].sel = tmp0;
5529 alu.src[1].chan = 0;
5530
5531 alu.last = 1;
5532 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5533 return r;
5534 }
5535
5536 /* 7. tmp1.x = tmp0.x - tmp0.w */
5537 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5538 alu.op = ALU_OP2_SUB_INT;
5539
5540 alu.dst.sel = tmp1;
5541 alu.dst.chan = 0;
5542 alu.dst.write = 1;
5543
5544 alu.src[0].sel = tmp0;
5545 alu.src[0].chan = 0;
5546 alu.src[1].sel = tmp0;
5547 alu.src[1].chan = 3;
5548
5549 alu.last = 1;
5550 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5551 return r;
5552
5553 /* 8. tmp1.y = tmp0.x + tmp0.w */
5554 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5555 alu.op = ALU_OP2_ADD_INT;
5556
5557 alu.dst.sel = tmp1;
5558 alu.dst.chan = 1;
5559 alu.dst.write = 1;
5560
5561 alu.src[0].sel = tmp0;
5562 alu.src[0].chan = 0;
5563 alu.src[1].sel = tmp0;
5564 alu.src[1].chan = 3;
5565
5566 alu.last = 1;
5567 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5568 return r;
5569
5570 /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
5571 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5572 alu.op = ALU_OP3_CNDE_INT;
5573 alu.is_op3 = 1;
5574
5575 alu.dst.sel = tmp0;
5576 alu.dst.chan = 0;
5577 alu.dst.write = 1;
5578
5579 alu.src[0].sel = tmp0;
5580 alu.src[0].chan = 1;
5581 alu.src[1].sel = tmp1;
5582 alu.src[1].chan = 1;
5583 alu.src[2].sel = tmp1;
5584 alu.src[2].chan = 0;
5585
5586 alu.last = 1;
5587 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5588 return r;
5589
5590 /* 10. tmp0.z = hi(tmp0.x * src1) = q */
5591 if (ctx->bc->chip_class == CAYMAN) {
5592 for (j = 0 ; j < 4; j++) {
5593 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5594 alu.op = ALU_OP2_MULHI_UINT;
5595
5596 alu.dst.sel = tmp0;
5597 alu.dst.chan = j;
5598 alu.dst.write = (j == 2);
5599
5600 alu.src[0].sel = tmp0;
5601 alu.src[0].chan = 0;
5602
5603 if (signed_op) {
5604 alu.src[1].sel = tmp2;
5605 alu.src[1].chan = 0;
5606 } else {
5607 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5608 }
5609
5610 alu.last = (j == 3);
5611 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5612 return r;
5613 }
5614 } else {
5615 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5616 alu.op = ALU_OP2_MULHI_UINT;
5617
5618 alu.dst.sel = tmp0;
5619 alu.dst.chan = 2;
5620 alu.dst.write = 1;
5621
5622 alu.src[0].sel = tmp0;
5623 alu.src[0].chan = 0;
5624
5625 if (signed_op) {
5626 alu.src[1].sel = tmp2;
5627 alu.src[1].chan = 0;
5628 } else {
5629 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5630 }
5631
5632 alu.last = 1;
5633 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5634 return r;
5635 }
5636
5637 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
5638 if (ctx->bc->chip_class == CAYMAN) {
5639 for (j = 0 ; j < 4; j++) {
5640 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5641 alu.op = ALU_OP2_MULLO_UINT;
5642
5643 alu.dst.sel = tmp0;
5644 alu.dst.chan = j;
5645 alu.dst.write = (j == 1);
5646
5647 if (signed_op) {
5648 alu.src[0].sel = tmp2;
5649 alu.src[0].chan = 1;
5650 } else {
5651 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5652 }
5653
5654 alu.src[1].sel = tmp0;
5655 alu.src[1].chan = 2;
5656
5657 alu.last = (j == 3);
5658 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5659 return r;
5660 }
5661 } else {
5662 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5663 alu.op = ALU_OP2_MULLO_UINT;
5664
5665 alu.dst.sel = tmp0;
5666 alu.dst.chan = 1;
5667 alu.dst.write = 1;
5668
5669 if (signed_op) {
5670 alu.src[0].sel = tmp2;
5671 alu.src[0].chan = 1;
5672 } else {
5673 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5674 }
5675
5676 alu.src[1].sel = tmp0;
5677 alu.src[1].chan = 2;
5678
5679 alu.last = 1;
5680 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5681 return r;
5682 }
5683
5684 /* 12. tmp0.w = src1 - tmp0.y = r */
5685 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5686 alu.op = ALU_OP2_SUB_INT;
5687
5688 alu.dst.sel = tmp0;
5689 alu.dst.chan = 3;
5690 alu.dst.write = 1;
5691
5692 if (signed_op) {
5693 alu.src[0].sel = tmp2;
5694 alu.src[0].chan = 0;
5695 } else {
5696 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5697 }
5698
5699 alu.src[1].sel = tmp0;
5700 alu.src[1].chan = 1;
5701
5702 alu.last = 1;
5703 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5704 return r;
5705
5706 /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
5707 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5708 alu.op = ALU_OP2_SETGE_UINT;
5709
5710 alu.dst.sel = tmp1;
5711 alu.dst.chan = 0;
5712 alu.dst.write = 1;
5713
5714 alu.src[0].sel = tmp0;
5715 alu.src[0].chan = 3;
5716 if (signed_op) {
5717 alu.src[1].sel = tmp2;
5718 alu.src[1].chan = 1;
5719 } else {
5720 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5721 }
5722
5723 alu.last = 1;
5724 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5725 return r;
5726
5727 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
5728 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5729 alu.op = ALU_OP2_SETGE_UINT;
5730
5731 alu.dst.sel = tmp1;
5732 alu.dst.chan = 1;
5733 alu.dst.write = 1;
5734
5735 if (signed_op) {
5736 alu.src[0].sel = tmp2;
5737 alu.src[0].chan = 0;
5738 } else {
5739 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5740 }
5741
5742 alu.src[1].sel = tmp0;
5743 alu.src[1].chan = 1;
5744
5745 alu.last = 1;
5746 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5747 return r;
5748
5749 if (mod) { /* UMOD */
5750
5751 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */
5752 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5753 alu.op = ALU_OP2_SUB_INT;
5754
5755 alu.dst.sel = tmp1;
5756 alu.dst.chan = 2;
5757 alu.dst.write = 1;
5758
5759 alu.src[0].sel = tmp0;
5760 alu.src[0].chan = 3;
5761
5762 if (signed_op) {
5763 alu.src[1].sel = tmp2;
5764 alu.src[1].chan = 1;
5765 } else {
5766 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5767 }
5768
5769 alu.last = 1;
5770 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5771 return r;
5772
5773 /* 16. tmp1.w = tmp0.w + src2 = r + src2 */
5774 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5775 alu.op = ALU_OP2_ADD_INT;
5776
5777 alu.dst.sel = tmp1;
5778 alu.dst.chan = 3;
5779 alu.dst.write = 1;
5780
5781 alu.src[0].sel = tmp0;
5782 alu.src[0].chan = 3;
5783 if (signed_op) {
5784 alu.src[1].sel = tmp2;
5785 alu.src[1].chan = 1;
5786 } else {
5787 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5788 }
5789
5790 alu.last = 1;
5791 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5792 return r;
5793
5794 } else { /* UDIV */
5795
5796 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
5797 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5798 alu.op = ALU_OP2_ADD_INT;
5799
5800 alu.dst.sel = tmp1;
5801 alu.dst.chan = 2;
5802 alu.dst.write = 1;
5803
5804 alu.src[0].sel = tmp0;
5805 alu.src[0].chan = 2;
5806 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
5807
5808 alu.last = 1;
5809 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5810 return r;
5811
5812 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */
5813 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5814 alu.op = ALU_OP2_ADD_INT;
5815
5816 alu.dst.sel = tmp1;
5817 alu.dst.chan = 3;
5818 alu.dst.write = 1;
5819
5820 alu.src[0].sel = tmp0;
5821 alu.src[0].chan = 2;
5822 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
5823
5824 alu.last = 1;
5825 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5826 return r;
5827
5828 }
5829
5830 /* 17. tmp1.x = tmp1.x & tmp1.y */
5831 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5832 alu.op = ALU_OP2_AND_INT;
5833
5834 alu.dst.sel = tmp1;
5835 alu.dst.chan = 0;
5836 alu.dst.write = 1;
5837
5838 alu.src[0].sel = tmp1;
5839 alu.src[0].chan = 0;
5840 alu.src[1].sel = tmp1;
5841 alu.src[1].chan = 1;
5842
5843 alu.last = 1;
5844 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5845 return r;
5846
5847 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
5848 /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
5849 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5850 alu.op = ALU_OP3_CNDE_INT;
5851 alu.is_op3 = 1;
5852
5853 alu.dst.sel = tmp0;
5854 alu.dst.chan = 2;
5855 alu.dst.write = 1;
5856
5857 alu.src[0].sel = tmp1;
5858 alu.src[0].chan = 0;
5859 alu.src[1].sel = tmp0;
5860 alu.src[1].chan = mod ? 3 : 2;
5861 alu.src[2].sel = tmp1;
5862 alu.src[2].chan = 2;
5863
5864 alu.last = 1;
5865 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5866 return r;
5867
5868 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
5869 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5870 alu.op = ALU_OP3_CNDE_INT;
5871 alu.is_op3 = 1;
5872
5873 if (signed_op) {
5874 alu.dst.sel = tmp0;
5875 alu.dst.chan = 2;
5876 alu.dst.write = 1;
5877 } else {
5878 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5879 }
5880
5881 alu.src[0].sel = tmp1;
5882 alu.src[0].chan = 1;
5883 alu.src[1].sel = tmp1;
5884 alu.src[1].chan = 3;
5885 alu.src[2].sel = tmp0;
5886 alu.src[2].chan = 2;
5887
5888 alu.last = 1;
5889 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5890 return r;
5891
5892 if (signed_op) {
5893
5894 /* fix the sign of the result */
5895
5896 if (mod) {
5897
5898 /* tmp0.x = -tmp0.z */
5899 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5900 alu.op = ALU_OP2_SUB_INT;
5901
5902 alu.dst.sel = tmp0;
5903 alu.dst.chan = 0;
5904 alu.dst.write = 1;
5905
5906 alu.src[0].sel = V_SQ_ALU_SRC_0;
5907 alu.src[1].sel = tmp0;
5908 alu.src[1].chan = 2;
5909
5910 alu.last = 1;
5911 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5912 return r;
5913
5914 /* sign of the remainder is the same as the sign of src0 */
5915 /* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
5916 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5917 alu.op = ALU_OP3_CNDGE_INT;
5918 alu.is_op3 = 1;
5919
5920 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5921
5922 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5923 alu.src[1].sel = tmp0;
5924 alu.src[1].chan = 2;
5925 alu.src[2].sel = tmp0;
5926 alu.src[2].chan = 0;
5927
5928 alu.last = 1;
5929 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5930 return r;
5931
5932 } else {
5933
5934 /* tmp0.x = -tmp0.z */
5935 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5936 alu.op = ALU_OP2_SUB_INT;
5937
5938 alu.dst.sel = tmp0;
5939 alu.dst.chan = 0;
5940 alu.dst.write = 1;
5941
5942 alu.src[0].sel = V_SQ_ALU_SRC_0;
5943 alu.src[1].sel = tmp0;
5944 alu.src[1].chan = 2;
5945
5946 alu.last = 1;
5947 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5948 return r;
5949
5950 /* fix the quotient sign (same as the sign of src0*src1) */
5951 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
5952 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5953 alu.op = ALU_OP3_CNDGE_INT;
5954 alu.is_op3 = 1;
5955
5956 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5957
5958 alu.src[0].sel = tmp2;
5959 alu.src[0].chan = 2;
5960 alu.src[1].sel = tmp0;
5961 alu.src[1].chan = 2;
5962 alu.src[2].sel = tmp0;
5963 alu.src[2].chan = 0;
5964
5965 alu.last = 1;
5966 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5967 return r;
5968 }
5969 }
5970 }
5971 return 0;
5972 }
5973
/* TGSI_OPCODE_UDIV: unsigned integer division (quotient only). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, /*mod=*/0, /*signed_op=*/0);
}
5978
/* TGSI_OPCODE_UMOD: unsigned integer modulo (remainder only). */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, /*mod=*/1, /*signed_op=*/0);
}
5983
/* TGSI_OPCODE_IDIV: signed integer division (quotient only). */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, /*mod=*/0, /*signed_op=*/1);
}
5988
/* TGSI_OPCODE_IMOD: signed integer modulo (remainder only). */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, /*mod=*/1, /*signed_op=*/1);
}
5993
5994
5995 static int tgsi_f2i(struct r600_shader_ctx *ctx)
5996 {
5997 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5998 struct r600_bytecode_alu alu;
5999 int i, r;
6000 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6001 int last_inst = tgsi_last_instruction(write_mask);
6002
6003 for (i = 0; i < 4; i++) {
6004 if (!(write_mask & (1<<i)))
6005 continue;
6006
6007 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6008 alu.op = ALU_OP1_TRUNC;
6009
6010 alu.dst.sel = ctx->temp_reg;
6011 alu.dst.chan = i;
6012 alu.dst.write = 1;
6013
6014 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6015 if (i == last_inst)
6016 alu.last = 1;
6017 r = r600_bytecode_add_alu(ctx->bc, &alu);
6018 if (r)
6019 return r;
6020 }
6021
6022 for (i = 0; i < 4; i++) {
6023 if (!(write_mask & (1<<i)))
6024 continue;
6025
6026 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6027 alu.op = ctx->inst_info->op;
6028
6029 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6030
6031 alu.src[0].sel = ctx->temp_reg;
6032 alu.src[0].chan = i;
6033
6034 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
6035 alu.last = 1;
6036 r = r600_bytecode_add_alu(ctx->bc, &alu);
6037 if (r)
6038 return r;
6039 }
6040
6041 return 0;
6042 }
6043
6044 static int tgsi_iabs(struct r600_shader_ctx *ctx)
6045 {
6046 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6047 struct r600_bytecode_alu alu;
6048 int i, r;
6049 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6050 int last_inst = tgsi_last_instruction(write_mask);
6051
6052 /* tmp = -src */
6053 for (i = 0; i < 4; i++) {
6054 if (!(write_mask & (1<<i)))
6055 continue;
6056
6057 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6058 alu.op = ALU_OP2_SUB_INT;
6059
6060 alu.dst.sel = ctx->temp_reg;
6061 alu.dst.chan = i;
6062 alu.dst.write = 1;
6063
6064 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6065 alu.src[0].sel = V_SQ_ALU_SRC_0;
6066
6067 if (i == last_inst)
6068 alu.last = 1;
6069 r = r600_bytecode_add_alu(ctx->bc, &alu);
6070 if (r)
6071 return r;
6072 }
6073
6074 /* dst = (src >= 0 ? src : tmp) */
6075 for (i = 0; i < 4; i++) {
6076 if (!(write_mask & (1<<i)))
6077 continue;
6078
6079 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6080 alu.op = ALU_OP3_CNDGE_INT;
6081 alu.is_op3 = 1;
6082 alu.dst.write = 1;
6083
6084 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6085
6086 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6087 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6088 alu.src[2].sel = ctx->temp_reg;
6089 alu.src[2].chan = i;
6090
6091 if (i == last_inst)
6092 alu.last = 1;
6093 r = r600_bytecode_add_alu(ctx->bc, &alu);
6094 if (r)
6095 return r;
6096 }
6097 return 0;
6098 }
6099
6100 static int tgsi_issg(struct r600_shader_ctx *ctx)
6101 {
6102 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6103 struct r600_bytecode_alu alu;
6104 int i, r;
6105 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6106 int last_inst = tgsi_last_instruction(write_mask);
6107
6108 /* tmp = (src >= 0 ? src : -1) */
6109 for (i = 0; i < 4; i++) {
6110 if (!(write_mask & (1<<i)))
6111 continue;
6112
6113 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6114 alu.op = ALU_OP3_CNDGE_INT;
6115 alu.is_op3 = 1;
6116
6117 alu.dst.sel = ctx->temp_reg;
6118 alu.dst.chan = i;
6119 alu.dst.write = 1;
6120
6121 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6122 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6123 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
6124
6125 if (i == last_inst)
6126 alu.last = 1;
6127 r = r600_bytecode_add_alu(ctx->bc, &alu);
6128 if (r)
6129 return r;
6130 }
6131
6132 /* dst = (tmp > 0 ? 1 : tmp) */
6133 for (i = 0; i < 4; i++) {
6134 if (!(write_mask & (1<<i)))
6135 continue;
6136
6137 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6138 alu.op = ALU_OP3_CNDGT_INT;
6139 alu.is_op3 = 1;
6140 alu.dst.write = 1;
6141
6142 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6143
6144 alu.src[0].sel = ctx->temp_reg;
6145 alu.src[0].chan = i;
6146
6147 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6148
6149 alu.src[2].sel = ctx->temp_reg;
6150 alu.src[2].chan = i;
6151
6152 if (i == last_inst)
6153 alu.last = 1;
6154 r = r600_bytecode_add_alu(ctx->bc, &alu);
6155 if (r)
6156 return r;
6157 }
6158 return 0;
6159 }
6160
6161
6162
6163 static int tgsi_ssg(struct r600_shader_ctx *ctx)
6164 {
6165 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6166 struct r600_bytecode_alu alu;
6167 int i, r;
6168
6169 /* tmp = (src > 0 ? 1 : src) */
6170 for (i = 0; i < 4; i++) {
6171 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6172 alu.op = ALU_OP3_CNDGT;
6173 alu.is_op3 = 1;
6174
6175 alu.dst.sel = ctx->temp_reg;
6176 alu.dst.chan = i;
6177
6178 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6179 alu.src[1].sel = V_SQ_ALU_SRC_1;
6180 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6181
6182 if (i == 3)
6183 alu.last = 1;
6184 r = r600_bytecode_add_alu(ctx->bc, &alu);
6185 if (r)
6186 return r;
6187 }
6188
6189 /* dst = (-tmp > 0 ? -1 : tmp) */
6190 for (i = 0; i < 4; i++) {
6191 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6192 alu.op = ALU_OP3_CNDGT;
6193 alu.is_op3 = 1;
6194 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6195
6196 alu.src[0].sel = ctx->temp_reg;
6197 alu.src[0].chan = i;
6198 alu.src[0].neg = 1;
6199
6200 alu.src[1].sel = V_SQ_ALU_SRC_1;
6201 alu.src[1].neg = 1;
6202
6203 alu.src[2].sel = ctx->temp_reg;
6204 alu.src[2].chan = i;
6205
6206 if (i == 3)
6207 alu.last = 1;
6208 r = r600_bytecode_add_alu(ctx->bc, &alu);
6209 if (r)
6210 return r;
6211 }
6212 return 0;
6213 }
6214
/* TGSI_OPCODE_BFI: bitfield insert.
 *
 * dst = bitfield_insert(base = src0, insert = src1,
 *                       offset = src2, bits = src3)
 *
 * Emitted as:
 *   temp_reg = (bits >= 32)         -- special-case flag
 *   t1       = BFM(bits, offset)    -- mask of 'bits' ones at 'offset'
 *   t2       = insert << offset
 *   dst      = BFI(t1, t2, base)
 *   dst      = (temp_reg == 0) ? dst : insert
 *
 * The final CNDE handles the bits >= 32 case by replacing the whole
 * word with the insert value (NOTE(review): presumably because BFM
 * cannot produce an all-ones mask for width 32 -- confirm against
 * the ISA docs).
 */
static int tgsi_bfi(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	t1 = r600_get_temp(ctx);

	/* temp_reg.i = (bits >= 32) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* t1.i = BFM(bits, offset) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* create mask tmp */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_BFM_INT;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	/* t2.i = insert << offset */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* shift insert left */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHL_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.i = BFI(t1, t2, base) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* actual bitfield insert */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_BFI_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.i = (temp_reg == 0) ? dst : insert  -- bits >= 32 case */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		/* re-read the BFI result just written to dst */
		alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;

		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6332
6333 static int tgsi_msb(struct r600_shader_ctx *ctx)
6334 {
6335 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6336 struct r600_bytecode_alu alu;
6337 int i, r, t1, t2;
6338
6339 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6340 int last_inst = tgsi_last_instruction(write_mask);
6341
6342 assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
6343 ctx->inst_info->op == ALU_OP1_FFBH_UINT);
6344
6345 t1 = ctx->temp_reg;
6346
6347 /* bit position is indexed from lsb by TGSI, and from msb by the hardware */
6348 for (i = 0; i < 4; i++) {
6349 if (!(write_mask & (1<<i)))
6350 continue;
6351
6352 /* t1 = FFBH_INT / FFBH_UINT */
6353 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6354 alu.op = ctx->inst_info->op;
6355 alu.dst.sel = t1;
6356 alu.dst.chan = i;
6357 alu.dst.write = 1;
6358 alu.last = i == last_inst;
6359
6360 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6361
6362 r = r600_bytecode_add_alu(ctx->bc, &alu);
6363 if (r)
6364 return r;
6365 }
6366
6367 t2 = r600_get_temp(ctx);
6368
6369 for (i = 0; i < 4; i++) {
6370 if (!(write_mask & (1<<i)))
6371 continue;
6372
6373 /* t2 = 31 - t1 */
6374 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6375 alu.op = ALU_OP2_SUB_INT;
6376 alu.dst.sel = t2;
6377 alu.dst.chan = i;
6378 alu.dst.write = 1;
6379 alu.last = i == last_inst;
6380
6381 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
6382 alu.src[0].value = 31;
6383 alu.src[1].sel = t1;
6384 alu.src[1].chan = i;
6385
6386 r = r600_bytecode_add_alu(ctx->bc, &alu);
6387 if (r)
6388 return r;
6389 }
6390
6391 for (i = 0; i < 4; i++) {
6392 if (!(write_mask & (1<<i)))
6393 continue;
6394
6395 /* result = t1 >= 0 ? t2 : t1 */
6396 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6397 alu.op = ALU_OP3_CNDGE_INT;
6398 alu.is_op3 = 1;
6399 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6400 alu.dst.chan = i;
6401 alu.dst.write = 1;
6402 alu.last = i == last_inst;
6403
6404 alu.src[0].sel = t1;
6405 alu.src[0].chan = i;
6406 alu.src[1].sel = t2;
6407 alu.src[1].chan = i;
6408 alu.src[2].sel = t1;
6409 alu.src[2].chan = i;
6410
6411 r = r600_bytecode_add_alu(ctx->bc, &alu);
6412 if (r)
6413 return r;
6414 }
6415
6416 return 0;
6417 }
6418
6419 static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
6420 {
6421 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6422 struct r600_bytecode_alu alu;
6423 int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
6424 unsigned location;
6425 const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;
6426
6427 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
6428
6429 /* Interpolators have been marked for use already by allocate_system_value_inputs */
6430 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6431 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6432 location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
6433 }
6434 else {
6435 location = TGSI_INTERPOLATE_LOC_CENTROID;
6436 }
6437
6438 k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
6439 if (k < 0)
6440 k = 0;
6441 interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
6442 interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);
6443
6444 /* NOTE: currently offset is not perspective correct */
6445 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6446 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6447 int sample_gpr = -1;
6448 int gradientsH, gradientsV;
6449 struct r600_bytecode_tex tex;
6450
6451 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6452 sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
6453 }
6454
6455 gradientsH = r600_get_temp(ctx);
6456 gradientsV = r600_get_temp(ctx);
6457 for (i = 0; i < 2; i++) {
6458 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6459 tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
6460 tex.src_gpr = interp_gpr;
6461 tex.src_sel_x = interp_base_chan + 0;
6462 tex.src_sel_y = interp_base_chan + 1;
6463 tex.src_sel_z = 0;
6464 tex.src_sel_w = 0;
6465 tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
6466 tex.dst_sel_x = 0;
6467 tex.dst_sel_y = 1;
6468 tex.dst_sel_z = 7;
6469 tex.dst_sel_w = 7;
6470 tex.inst_mod = 1; // Use per pixel gradient calculation
6471 tex.sampler_id = 0;
6472 tex.resource_id = tex.sampler_id;
6473 r = r600_bytecode_add_tex(ctx->bc, &tex);
6474 if (r)
6475 return r;
6476 }
6477
6478 for (i = 0; i < 2; i++) {
6479 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6480 alu.op = ALU_OP3_MULADD;
6481 alu.is_op3 = 1;
6482 alu.src[0].sel = gradientsH;
6483 alu.src[0].chan = i;
6484 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6485 alu.src[1].sel = sample_gpr;
6486 alu.src[1].chan = 2;
6487 }
6488 else {
6489 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
6490 }
6491 alu.src[2].sel = interp_gpr;
6492 alu.src[2].chan = interp_base_chan + i;
6493 alu.dst.sel = ctx->temp_reg;
6494 alu.dst.chan = i;
6495 alu.last = i == 1;
6496
6497 r = r600_bytecode_add_alu(ctx->bc, &alu);
6498 if (r)
6499 return r;
6500 }
6501
6502 for (i = 0; i < 2; i++) {
6503 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6504 alu.op = ALU_OP3_MULADD;
6505 alu.is_op3 = 1;
6506 alu.src[0].sel = gradientsV;
6507 alu.src[0].chan = i;
6508 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6509 alu.src[1].sel = sample_gpr;
6510 alu.src[1].chan = 3;
6511 }
6512 else {
6513 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
6514 }
6515 alu.src[2].sel = ctx->temp_reg;
6516 alu.src[2].chan = i;
6517 alu.dst.sel = ctx->temp_reg;
6518 alu.dst.chan = i;
6519 alu.last = i == 1;
6520
6521 r = r600_bytecode_add_alu(ctx->bc, &alu);
6522 if (r)
6523 return r;
6524 }
6525 }
6526
6527 tmp = r600_get_temp(ctx);
6528 for (i = 0; i < 8; i++) {
6529 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6530 alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;
6531
6532 alu.dst.sel = tmp;
6533 if ((i > 1 && i < 6)) {
6534 alu.dst.write = 1;
6535 }
6536 else {
6537 alu.dst.write = 0;
6538 }
6539 alu.dst.chan = i % 4;
6540
6541 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6542 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6543 alu.src[0].sel = ctx->temp_reg;
6544 alu.src[0].chan = 1 - (i % 2);
6545 } else {
6546 alu.src[0].sel = interp_gpr;
6547 alu.src[0].chan = interp_base_chan + 1 - (i % 2);
6548 }
6549 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
6550 alu.src[1].chan = 0;
6551
6552 alu.last = i % 4 == 3;
6553 alu.bank_swizzle_force = SQ_ALU_VEC_210;
6554
6555 r = r600_bytecode_add_alu(ctx->bc, &alu);
6556 if (r)
6557 return r;
6558 }
6559
6560 // INTERP can't swizzle dst
6561 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6562 for (i = 0; i <= lasti; i++) {
6563 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6564 continue;
6565
6566 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6567 alu.op = ALU_OP1_MOV;
6568 alu.src[0].sel = tmp;
6569 alu.src[0].chan = ctx->src[0].swizzle[i];
6570 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6571 alu.dst.write = 1;
6572 alu.last = i == lasti;
6573 r = r600_bytecode_add_alu(ctx->bc, &alu);
6574 if (r)
6575 return r;
6576 }
6577
6578 return 0;
6579 }
6580
6581
6582 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
6583 {
6584 struct r600_bytecode_alu alu;
6585 int i, r;
6586
6587 for (i = 0; i < 4; i++) {
6588 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6589 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
6590 alu.op = ALU_OP0_NOP;
6591 alu.dst.chan = i;
6592 } else {
6593 alu.op = ALU_OP1_MOV;
6594 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6595 alu.src[0].sel = ctx->temp_reg;
6596 alu.src[0].chan = i;
6597 }
6598 if (i == 3) {
6599 alu.last = 1;
6600 }
6601 r = r600_bytecode_add_alu(ctx->bc, &alu);
6602 if (r)
6603 return r;
6604 }
6605 return 0;
6606 }
6607
6608 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
6609 unsigned temp, int chan,
6610 struct r600_bytecode_alu_src *bc_src,
6611 const struct r600_shader_src *shader_src)
6612 {
6613 struct r600_bytecode_alu alu;
6614 int r;
6615
6616 r600_bytecode_src(bc_src, shader_src, chan);
6617
6618 /* op3 operands don't support abs modifier */
6619 if (bc_src->abs) {
6620 assert(temp!=0); /* we actually need the extra register, make sure it is allocated. */
6621 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6622 alu.op = ALU_OP1_MOV;
6623 alu.dst.sel = temp;
6624 alu.dst.chan = chan;
6625 alu.dst.write = 1;
6626
6627 alu.src[0] = *bc_src;
6628 alu.last = true; // sufficient?
6629 r = r600_bytecode_add_alu(ctx->bc, &alu);
6630 if (r)
6631 return r;
6632
6633 memset(bc_src, 0, sizeof(*bc_src));
6634 bc_src->sel = temp;
6635 bc_src->chan = chan;
6636 }
6637 return 0;
6638 }
6639
6640 static int tgsi_op3(struct r600_shader_ctx *ctx)
6641 {
6642 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6643 struct r600_bytecode_alu alu;
6644 int i, j, r;
6645 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6646 int temp_regs[4];
6647 unsigned op = ctx->inst_info->op;
6648
6649 if (op == ALU_OP3_MULADD_IEEE &&
6650 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
6651 op = ALU_OP3_MULADD;
6652
6653 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6654 temp_regs[j] = 0;
6655 if (ctx->src[j].abs)
6656 temp_regs[j] = r600_get_temp(ctx);
6657 }
6658 for (i = 0; i < lasti + 1; i++) {
6659 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6660 continue;
6661
6662 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6663 alu.op = op;
6664 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6665 r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
6666 if (r)
6667 return r;
6668 }
6669
6670 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6671 alu.dst.chan = i;
6672 alu.dst.write = 1;
6673 alu.is_op3 = 1;
6674 if (i == lasti) {
6675 alu.last = 1;
6676 }
6677 r = r600_bytecode_add_alu(ctx->bc, &alu);
6678 if (r)
6679 return r;
6680 }
6681 return 0;
6682 }
6683
6684 static int tgsi_dp(struct r600_shader_ctx *ctx)
6685 {
6686 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6687 struct r600_bytecode_alu alu;
6688 int i, j, r;
6689 unsigned op = ctx->inst_info->op;
6690 if (op == ALU_OP2_DOT4_IEEE &&
6691 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
6692 op = ALU_OP2_DOT4;
6693
6694 for (i = 0; i < 4; i++) {
6695 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6696 alu.op = op;
6697 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6698 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
6699 }
6700
6701 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6702 alu.dst.chan = i;
6703 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
6704 /* handle some special cases */
6705 switch (inst->Instruction.Opcode) {
6706 case TGSI_OPCODE_DP2:
6707 if (i > 1) {
6708 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6709 alu.src[0].chan = alu.src[1].chan = 0;
6710 }
6711 break;
6712 case TGSI_OPCODE_DP3:
6713 if (i > 2) {
6714 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6715 alu.src[0].chan = alu.src[1].chan = 0;
6716 }
6717 break;
6718 default:
6719 break;
6720 }
6721 if (i == 3) {
6722 alu.last = 1;
6723 }
6724 r = r600_bytecode_add_alu(ctx->bc, &alu);
6725 if (r)
6726 return r;
6727 }
6728 return 0;
6729 }
6730
6731 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
6732 unsigned index)
6733 {
6734 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6735 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
6736 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
6737 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
6738 ctx->src[index].neg || ctx->src[index].abs ||
6739 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
6740 }
6741
6742 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
6743 unsigned index)
6744 {
6745 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6746 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
6747 }
6748
6749 static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
6750 {
6751 struct r600_bytecode_vtx vtx;
6752 struct r600_bytecode_alu alu;
6753 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6754 int src_gpr, r, i;
6755 int id = tgsi_tex_get_src_gpr(ctx, 1);
6756
6757 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
6758 if (src_requires_loading) {
6759 for (i = 0; i < 4; i++) {
6760 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6761 alu.op = ALU_OP1_MOV;
6762 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6763 alu.dst.sel = ctx->temp_reg;
6764 alu.dst.chan = i;
6765 if (i == 3)
6766 alu.last = 1;
6767 alu.dst.write = 1;
6768 r = r600_bytecode_add_alu(ctx->bc, &alu);
6769 if (r)
6770 return r;
6771 }
6772 src_gpr = ctx->temp_reg;
6773 }
6774
6775 memset(&vtx, 0, sizeof(vtx));
6776 vtx.op = FETCH_OP_VFETCH;
6777 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
6778 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
6779 vtx.src_gpr = src_gpr;
6780 vtx.mega_fetch_count = 16;
6781 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6782 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
6783 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
6784 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
6785 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
6786 vtx.use_const_fields = 1;
6787
6788 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
6789 return r;
6790
6791 if (ctx->bc->chip_class >= EVERGREEN)
6792 return 0;
6793
6794 for (i = 0; i < 4; i++) {
6795 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6796 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6797 continue;
6798
6799 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6800 alu.op = ALU_OP2_AND_INT;
6801
6802 alu.dst.chan = i;
6803 alu.dst.sel = vtx.dst_gpr;
6804 alu.dst.write = 1;
6805
6806 alu.src[0].sel = vtx.dst_gpr;
6807 alu.src[0].chan = i;
6808
6809 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
6810 alu.src[1].sel += (id * 2);
6811 alu.src[1].chan = i % 4;
6812 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6813
6814 if (i == lasti)
6815 alu.last = 1;
6816 r = r600_bytecode_add_alu(ctx->bc, &alu);
6817 if (r)
6818 return r;
6819 }
6820
6821 if (inst->Dst[0].Register.WriteMask & 3) {
6822 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6823 alu.op = ALU_OP2_OR_INT;
6824
6825 alu.dst.chan = 3;
6826 alu.dst.sel = vtx.dst_gpr;
6827 alu.dst.write = 1;
6828
6829 alu.src[0].sel = vtx.dst_gpr;
6830 alu.src[0].chan = 3;
6831
6832 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
6833 alu.src[1].chan = 0;
6834 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6835
6836 alu.last = 1;
6837 r = r600_bytecode_add_alu(ctx->bc, &alu);
6838 if (r)
6839 return r;
6840 }
6841 return 0;
6842 }
6843
6844 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset)
6845 {
6846 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6847 struct r600_bytecode_alu alu;
6848 int r;
6849 int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
6850
6851 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6852 alu.op = ALU_OP1_MOV;
6853 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
6854 if (ctx->bc->chip_class >= EVERGREEN) {
6855 /* channel 0 or 2 of each word */
6856 alu.src[0].sel += (id / 2);
6857 alu.src[0].chan = (id % 2) * 2;
6858 } else {
6859 /* r600 we have them at channel 2 of the second dword */
6860 alu.src[0].sel += (id * 2) + 1;
6861 alu.src[0].chan = 1;
6862 }
6863 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6864 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
6865 alu.last = 1;
6866 r = r600_bytecode_add_alu(ctx->bc, &alu);
6867 if (r)
6868 return r;
6869 return 0;
6870 }
6871
6872 static int tgsi_tex(struct r600_shader_ctx *ctx)
6873 {
6874 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6875 struct r600_bytecode_tex tex;
6876 struct r600_bytecode_alu alu;
6877 unsigned src_gpr;
6878 int r, i, j;
6879 int opcode;
6880 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
6881 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
6882 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
6883 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
6884
6885 bool txf_add_offsets = inst->Texture.NumOffsets &&
6886 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
6887 inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
6888
6889 /* Texture fetch instructions can only use gprs as source.
6890 * Also they cannot negate the source or take the absolute value */
6891 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
6892 tgsi_tex_src_requires_loading(ctx, 0)) ||
6893 read_compressed_msaa || txf_add_offsets;
6894
6895 boolean src_loaded = FALSE;
6896 unsigned sampler_src_reg = 1;
6897 int8_t offset_x = 0, offset_y = 0, offset_z = 0;
6898 boolean has_txq_cube_array_z = false;
6899 unsigned sampler_index_mode;
6900
6901 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
6902 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6903 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
6904 if (inst->Dst[0].Register.WriteMask & 4) {
6905 ctx->shader->has_txq_cube_array_z_comp = true;
6906 has_txq_cube_array_z = true;
6907 }
6908
6909 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
6910 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6911 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
6912 inst->Instruction.Opcode == TGSI_OPCODE_TG4)
6913 sampler_src_reg = 2;
6914
6915 /* TGSI moves the sampler to src reg 3 for TXD */
6916 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
6917 sampler_src_reg = 3;
6918
6919 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
6920
6921 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
6922
6923 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
6924 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
6925 ctx->shader->uses_tex_buffers = true;
6926 return r600_do_buffer_txq(ctx, 1, 0);
6927 }
6928 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
6929 if (ctx->bc->chip_class < EVERGREEN)
6930 ctx->shader->uses_tex_buffers = true;
6931 return do_vtx_fetch_inst(ctx, src_requires_loading);
6932 }
6933 }
6934
6935 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
6936 int out_chan;
6937 /* Add perspective divide */
6938 if (ctx->bc->chip_class == CAYMAN) {
6939 out_chan = 2;
6940 for (i = 0; i < 3; i++) {
6941 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6942 alu.op = ALU_OP1_RECIP_IEEE;
6943 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6944
6945 alu.dst.sel = ctx->temp_reg;
6946 alu.dst.chan = i;
6947 if (i == 2)
6948 alu.last = 1;
6949 if (out_chan == i)
6950 alu.dst.write = 1;
6951 r = r600_bytecode_add_alu(ctx->bc, &alu);
6952 if (r)
6953 return r;
6954 }
6955
6956 } else {
6957 out_chan = 3;
6958 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6959 alu.op = ALU_OP1_RECIP_IEEE;
6960 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6961
6962 alu.dst.sel = ctx->temp_reg;
6963 alu.dst.chan = out_chan;
6964 alu.last = 1;
6965 alu.dst.write = 1;
6966 r = r600_bytecode_add_alu(ctx->bc, &alu);
6967 if (r)
6968 return r;
6969 }
6970
6971 for (i = 0; i < 3; i++) {
6972 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6973 alu.op = ALU_OP2_MUL;
6974 alu.src[0].sel = ctx->temp_reg;
6975 alu.src[0].chan = out_chan;
6976 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6977 alu.dst.sel = ctx->temp_reg;
6978 alu.dst.chan = i;
6979 alu.dst.write = 1;
6980 r = r600_bytecode_add_alu(ctx->bc, &alu);
6981 if (r)
6982 return r;
6983 }
6984 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6985 alu.op = ALU_OP1_MOV;
6986 alu.src[0].sel = V_SQ_ALU_SRC_1;
6987 alu.src[0].chan = 0;
6988 alu.dst.sel = ctx->temp_reg;
6989 alu.dst.chan = 3;
6990 alu.last = 1;
6991 alu.dst.write = 1;
6992 r = r600_bytecode_add_alu(ctx->bc, &alu);
6993 if (r)
6994 return r;
6995 src_loaded = TRUE;
6996 src_gpr = ctx->temp_reg;
6997 }
6998
6999
7000 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7001 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7002 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7003 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7004 inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
7005
7006 static const unsigned src0_swizzle[] = {2, 2, 0, 1};
7007 static const unsigned src1_swizzle[] = {1, 0, 2, 2};
7008
7009 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
7010 for (i = 0; i < 4; i++) {
7011 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7012 alu.op = ALU_OP2_CUBE;
7013 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7014 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
7015 alu.dst.sel = ctx->temp_reg;
7016 alu.dst.chan = i;
7017 if (i == 3)
7018 alu.last = 1;
7019 alu.dst.write = 1;
7020 r = r600_bytecode_add_alu(ctx->bc, &alu);
7021 if (r)
7022 return r;
7023 }
7024
7025 /* tmp1.z = RCP_e(|tmp1.z|) */
7026 if (ctx->bc->chip_class == CAYMAN) {
7027 for (i = 0; i < 3; i++) {
7028 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7029 alu.op = ALU_OP1_RECIP_IEEE;
7030 alu.src[0].sel = ctx->temp_reg;
7031 alu.src[0].chan = 2;
7032 alu.src[0].abs = 1;
7033 alu.dst.sel = ctx->temp_reg;
7034 alu.dst.chan = i;
7035 if (i == 2)
7036 alu.dst.write = 1;
7037 if (i == 2)
7038 alu.last = 1;
7039 r = r600_bytecode_add_alu(ctx->bc, &alu);
7040 if (r)
7041 return r;
7042 }
7043 } else {
7044 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7045 alu.op = ALU_OP1_RECIP_IEEE;
7046 alu.src[0].sel = ctx->temp_reg;
7047 alu.src[0].chan = 2;
7048 alu.src[0].abs = 1;
7049 alu.dst.sel = ctx->temp_reg;
7050 alu.dst.chan = 2;
7051 alu.dst.write = 1;
7052 alu.last = 1;
7053 r = r600_bytecode_add_alu(ctx->bc, &alu);
7054 if (r)
7055 return r;
7056 }
7057
7058 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
7059 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
7060 * muladd has no writemask, have to use another temp
7061 */
7062 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7063 alu.op = ALU_OP3_MULADD;
7064 alu.is_op3 = 1;
7065
7066 alu.src[0].sel = ctx->temp_reg;
7067 alu.src[0].chan = 0;
7068 alu.src[1].sel = ctx->temp_reg;
7069 alu.src[1].chan = 2;
7070
7071 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7072 alu.src[2].chan = 0;
7073 alu.src[2].value = u_bitcast_f2u(1.5f);
7074
7075 alu.dst.sel = ctx->temp_reg;
7076 alu.dst.chan = 0;
7077 alu.dst.write = 1;
7078
7079 r = r600_bytecode_add_alu(ctx->bc, &alu);
7080 if (r)
7081 return r;
7082
7083 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7084 alu.op = ALU_OP3_MULADD;
7085 alu.is_op3 = 1;
7086
7087 alu.src[0].sel = ctx->temp_reg;
7088 alu.src[0].chan = 1;
7089 alu.src[1].sel = ctx->temp_reg;
7090 alu.src[1].chan = 2;
7091
7092 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7093 alu.src[2].chan = 0;
7094 alu.src[2].value = u_bitcast_f2u(1.5f);
7095
7096 alu.dst.sel = ctx->temp_reg;
7097 alu.dst.chan = 1;
7098 alu.dst.write = 1;
7099
7100 alu.last = 1;
7101 r = r600_bytecode_add_alu(ctx->bc, &alu);
7102 if (r)
7103 return r;
7104 /* write initial compare value into Z component
7105 - W src 0 for shadow cube
7106 - X src 1 for shadow cube array */
7107 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7108 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7109 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7110 alu.op = ALU_OP1_MOV;
7111 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
7112 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7113 else
7114 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7115 alu.dst.sel = ctx->temp_reg;
7116 alu.dst.chan = 2;
7117 alu.dst.write = 1;
7118 alu.last = 1;
7119 r = r600_bytecode_add_alu(ctx->bc, &alu);
7120 if (r)
7121 return r;
7122 }
7123
7124 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7125 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7126 if (ctx->bc->chip_class >= EVERGREEN) {
7127 int mytmp = r600_get_temp(ctx);
7128 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7129 alu.op = ALU_OP1_MOV;
7130 alu.src[0].sel = ctx->temp_reg;
7131 alu.src[0].chan = 3;
7132 alu.dst.sel = mytmp;
7133 alu.dst.chan = 0;
7134 alu.dst.write = 1;
7135 alu.last = 1;
7136 r = r600_bytecode_add_alu(ctx->bc, &alu);
7137 if (r)
7138 return r;
7139
7140 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */
7141 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7142 alu.op = ALU_OP3_MULADD;
7143 alu.is_op3 = 1;
7144 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7145 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7146 alu.src[1].chan = 0;
7147 alu.src[1].value = u_bitcast_f2u(8.0f);
7148 alu.src[2].sel = mytmp;
7149 alu.src[2].chan = 0;
7150 alu.dst.sel = ctx->temp_reg;
7151 alu.dst.chan = 3;
7152 alu.dst.write = 1;
7153 alu.last = 1;
7154 r = r600_bytecode_add_alu(ctx->bc, &alu);
7155 if (r)
7156 return r;
7157 } else if (ctx->bc->chip_class < EVERGREEN) {
7158 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7159 tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7160 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7161 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7162 tex.src_gpr = r600_get_temp(ctx);
7163 tex.src_sel_x = 0;
7164 tex.src_sel_y = 0;
7165 tex.src_sel_z = 0;
7166 tex.src_sel_w = 0;
7167 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7168 tex.coord_type_x = 1;
7169 tex.coord_type_y = 1;
7170 tex.coord_type_z = 1;
7171 tex.coord_type_w = 1;
7172 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7173 alu.op = ALU_OP1_MOV;
7174 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7175 alu.dst.sel = tex.src_gpr;
7176 alu.dst.chan = 0;
7177 alu.last = 1;
7178 alu.dst.write = 1;
7179 r = r600_bytecode_add_alu(ctx->bc, &alu);
7180 if (r)
7181 return r;
7182
7183 r = r600_bytecode_add_tex(ctx->bc, &tex);
7184 if (r)
7185 return r;
7186 }
7187
7188 }
7189
7190 /* for cube forms of lod and bias we need to route things */
7191 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7192 inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7193 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7194 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7195 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7196 alu.op = ALU_OP1_MOV;
7197 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7198 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7199 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7200 else
7201 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7202 alu.dst.sel = ctx->temp_reg;
7203 alu.dst.chan = 2;
7204 alu.last = 1;
7205 alu.dst.write = 1;
7206 r = r600_bytecode_add_alu(ctx->bc, &alu);
7207 if (r)
7208 return r;
7209 }
7210
7211 src_loaded = TRUE;
7212 src_gpr = ctx->temp_reg;
7213 }
7214
7215 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7216 int temp_h = 0, temp_v = 0;
7217 int start_val = 0;
7218
7219 /* if we've already loaded the src (i.e. CUBE don't reload it). */
7220 if (src_loaded == TRUE)
7221 start_val = 1;
7222 else
7223 src_loaded = TRUE;
7224 for (i = start_val; i < 3; i++) {
7225 int treg = r600_get_temp(ctx);
7226
7227 if (i == 0)
7228 src_gpr = treg;
7229 else if (i == 1)
7230 temp_h = treg;
7231 else
7232 temp_v = treg;
7233
7234 for (j = 0; j < 4; j++) {
7235 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7236 alu.op = ALU_OP1_MOV;
7237 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7238 alu.dst.sel = treg;
7239 alu.dst.chan = j;
7240 if (j == 3)
7241 alu.last = 1;
7242 alu.dst.write = 1;
7243 r = r600_bytecode_add_alu(ctx->bc, &alu);
7244 if (r)
7245 return r;
7246 }
7247 }
7248 for (i = 1; i < 3; i++) {
7249 /* set gradients h/v */
7250 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7251 tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7252 FETCH_OP_SET_GRADIENTS_V;
7253 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7254 tex.sampler_index_mode = sampler_index_mode;
7255 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7256 tex.resource_index_mode = sampler_index_mode;
7257
7258 tex.src_gpr = (i == 1) ? temp_h : temp_v;
7259 tex.src_sel_x = 0;
7260 tex.src_sel_y = 1;
7261 tex.src_sel_z = 2;
7262 tex.src_sel_w = 3;
7263
7264 tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7265 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7266 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7267 tex.coord_type_x = 1;
7268 tex.coord_type_y = 1;
7269 tex.coord_type_z = 1;
7270 tex.coord_type_w = 1;
7271 }
7272 r = r600_bytecode_add_tex(ctx->bc, &tex);
7273 if (r)
7274 return r;
7275 }
7276 }
7277
7278 if (src_requires_loading && !src_loaded) {
7279 for (i = 0; i < 4; i++) {
7280 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7281 alu.op = ALU_OP1_MOV;
7282 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7283 alu.dst.sel = ctx->temp_reg;
7284 alu.dst.chan = i;
7285 if (i == 3)
7286 alu.last = 1;
7287 alu.dst.write = 1;
7288 r = r600_bytecode_add_alu(ctx->bc, &alu);
7289 if (r)
7290 return r;
7291 }
7292 src_loaded = TRUE;
7293 src_gpr = ctx->temp_reg;
7294 }
7295
7296 /* get offset values */
7297 if (inst->Texture.NumOffsets) {
7298 assert(inst->Texture.NumOffsets == 1);
7299
7300 /* The texture offset feature doesn't work with the TXF instruction
7301 * and must be emulated by adding the offset to the texture coordinates. */
7302 if (txf_add_offsets) {
7303 const struct tgsi_texture_offset *off = inst->TexOffsets;
7304
7305 switch (inst->Texture.Texture) {
7306 case TGSI_TEXTURE_3D:
7307 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7308 alu.op = ALU_OP2_ADD_INT;
7309 alu.src[0].sel = src_gpr;
7310 alu.src[0].chan = 2;
7311 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7312 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
7313 alu.dst.sel = src_gpr;
7314 alu.dst.chan = 2;
7315 alu.dst.write = 1;
7316 alu.last = 1;
7317 r = r600_bytecode_add_alu(ctx->bc, &alu);
7318 if (r)
7319 return r;
7320 /* fall through */
7321
7322 case TGSI_TEXTURE_2D:
7323 case TGSI_TEXTURE_SHADOW2D:
7324 case TGSI_TEXTURE_RECT:
7325 case TGSI_TEXTURE_SHADOWRECT:
7326 case TGSI_TEXTURE_2D_ARRAY:
7327 case TGSI_TEXTURE_SHADOW2D_ARRAY:
7328 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7329 alu.op = ALU_OP2_ADD_INT;
7330 alu.src[0].sel = src_gpr;
7331 alu.src[0].chan = 1;
7332 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7333 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
7334 alu.dst.sel = src_gpr;
7335 alu.dst.chan = 1;
7336 alu.dst.write = 1;
7337 alu.last = 1;
7338 r = r600_bytecode_add_alu(ctx->bc, &alu);
7339 if (r)
7340 return r;
7341 /* fall through */
7342
7343 case TGSI_TEXTURE_1D:
7344 case TGSI_TEXTURE_SHADOW1D:
7345 case TGSI_TEXTURE_1D_ARRAY:
7346 case TGSI_TEXTURE_SHADOW1D_ARRAY:
7347 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7348 alu.op = ALU_OP2_ADD_INT;
7349 alu.src[0].sel = src_gpr;
7350 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7351 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
7352 alu.dst.sel = src_gpr;
7353 alu.dst.write = 1;
7354 alu.last = 1;
7355 r = r600_bytecode_add_alu(ctx->bc, &alu);
7356 if (r)
7357 return r;
7358 break;
7359 /* texture offsets do not apply to other texture targets */
7360 }
7361 } else {
7362 switch (inst->Texture.Texture) {
7363 case TGSI_TEXTURE_3D:
7364 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
7365 /* fallthrough */
7366 case TGSI_TEXTURE_2D:
7367 case TGSI_TEXTURE_SHADOW2D:
7368 case TGSI_TEXTURE_RECT:
7369 case TGSI_TEXTURE_SHADOWRECT:
7370 case TGSI_TEXTURE_2D_ARRAY:
7371 case TGSI_TEXTURE_SHADOW2D_ARRAY:
7372 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
7373 /* fallthrough */
7374 case TGSI_TEXTURE_1D:
7375 case TGSI_TEXTURE_SHADOW1D:
7376 case TGSI_TEXTURE_1D_ARRAY:
7377 case TGSI_TEXTURE_SHADOW1D_ARRAY:
7378 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
7379 }
7380 }
7381 }
7382
7383 /* Obtain the sample index for reading a compressed MSAA color texture.
7384 * To read the FMASK, we use the ldfptr instruction, which tells us
7385 * where the samples are stored.
7386 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
7387 * which is the identity mapping. Each nibble says which physical sample
7388 * should be fetched to get that sample.
7389 *
7390 * Assume src.z contains the sample index. It should be modified like this:
7391 * src.z = (ldfptr() >> (src.z * 4)) & 0xF;
7392 * Then fetch the texel with src.
7393 */
7394 if (read_compressed_msaa) {
7395 unsigned sample_chan = 3;
7396 unsigned temp = r600_get_temp(ctx);
7397 assert(src_loaded);
7398
7399 /* temp.w = ldfptr() */
7400 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7401 tex.op = FETCH_OP_LD;
7402 tex.inst_mod = 1; /* to indicate this is ldfptr */
7403 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7404 tex.sampler_index_mode = sampler_index_mode;
7405 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7406 tex.resource_index_mode = sampler_index_mode;
7407 tex.src_gpr = src_gpr;
7408 tex.dst_gpr = temp;
7409 tex.dst_sel_x = 7; /* mask out these components */
7410 tex.dst_sel_y = 7;
7411 tex.dst_sel_z = 7;
7412 tex.dst_sel_w = 0; /* store X */
7413 tex.src_sel_x = 0;
7414 tex.src_sel_y = 1;
7415 tex.src_sel_z = 2;
7416 tex.src_sel_w = 3;
7417 tex.offset_x = offset_x;
7418 tex.offset_y = offset_y;
7419 tex.offset_z = offset_z;
7420 r = r600_bytecode_add_tex(ctx->bc, &tex);
7421 if (r)
7422 return r;
7423
7424 /* temp.x = sample_index*4 */
7425 if (ctx->bc->chip_class == CAYMAN) {
7426 for (i = 0 ; i < 4; i++) {
7427 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7428 alu.op = ALU_OP2_MULLO_INT;
7429 alu.src[0].sel = src_gpr;
7430 alu.src[0].chan = sample_chan;
7431 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7432 alu.src[1].value = 4;
7433 alu.dst.sel = temp;
7434 alu.dst.chan = i;
7435 alu.dst.write = i == 0;
7436 if (i == 3)
7437 alu.last = 1;
7438 r = r600_bytecode_add_alu(ctx->bc, &alu);
7439 if (r)
7440 return r;
7441 }
7442 } else {
7443 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7444 alu.op = ALU_OP2_MULLO_INT;
7445 alu.src[0].sel = src_gpr;
7446 alu.src[0].chan = sample_chan;
7447 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7448 alu.src[1].value = 4;
7449 alu.dst.sel = temp;
7450 alu.dst.chan = 0;
7451 alu.dst.write = 1;
7452 alu.last = 1;
7453 r = r600_bytecode_add_alu(ctx->bc, &alu);
7454 if (r)
7455 return r;
7456 }
7457
7458 /* sample_index = temp.w >> temp.x */
7459 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7460 alu.op = ALU_OP2_LSHR_INT;
7461 alu.src[0].sel = temp;
7462 alu.src[0].chan = 3;
7463 alu.src[1].sel = temp;
7464 alu.src[1].chan = 0;
7465 alu.dst.sel = src_gpr;
7466 alu.dst.chan = sample_chan;
7467 alu.dst.write = 1;
7468 alu.last = 1;
7469 r = r600_bytecode_add_alu(ctx->bc, &alu);
7470 if (r)
7471 return r;
7472
7473 /* sample_index & 0xF */
7474 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7475 alu.op = ALU_OP2_AND_INT;
7476 alu.src[0].sel = src_gpr;
7477 alu.src[0].chan = sample_chan;
7478 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7479 alu.src[1].value = 0xF;
7480 alu.dst.sel = src_gpr;
7481 alu.dst.chan = sample_chan;
7482 alu.dst.write = 1;
7483 alu.last = 1;
7484 r = r600_bytecode_add_alu(ctx->bc, &alu);
7485 if (r)
7486 return r;
7487 #if 0
7488 /* visualize the FMASK */
7489 for (i = 0; i < 4; i++) {
7490 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7491 alu.op = ALU_OP1_INT_TO_FLT;
7492 alu.src[0].sel = src_gpr;
7493 alu.src[0].chan = sample_chan;
7494 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7495 alu.dst.chan = i;
7496 alu.dst.write = 1;
7497 alu.last = 1;
7498 r = r600_bytecode_add_alu(ctx->bc, &alu);
7499 if (r)
7500 return r;
7501 }
7502 return 0;
7503 #endif
7504 }
7505
7506 /* does this shader want a num layers from TXQ for a cube array? */
7507 if (has_txq_cube_array_z) {
7508 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7509
7510 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7511 alu.op = ALU_OP1_MOV;
7512
7513 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
7514 if (ctx->bc->chip_class >= EVERGREEN) {
7515 /* channel 1 or 3 of each word */
7516 alu.src[0].sel += (id / 2);
7517 alu.src[0].chan = ((id % 2) * 2) + 1;
7518 } else {
7519 /* r600 we have them at channel 2 of the second dword */
7520 alu.src[0].sel += (id * 2) + 1;
7521 alu.src[0].chan = 2;
7522 }
7523 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7524 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
7525 alu.last = 1;
7526 r = r600_bytecode_add_alu(ctx->bc, &alu);
7527 if (r)
7528 return r;
7529 /* disable writemask from texture instruction */
7530 inst->Dst[0].Register.WriteMask &= ~4;
7531 }
7532
7533 opcode = ctx->inst_info->op;
7534 if (opcode == FETCH_OP_GATHER4 &&
7535 inst->TexOffsets[0].File != TGSI_FILE_NULL &&
7536 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
7537 opcode = FETCH_OP_GATHER4_O;
7538
7539 /* GATHER4_O/GATHER4_C_O use offset values loaded by
7540 SET_TEXTURE_OFFSETS instruction. The immediate offset values
7541 encoded in the instruction are ignored. */
7542 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7543 tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
7544 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7545 tex.sampler_index_mode = sampler_index_mode;
7546 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7547 tex.resource_index_mode = sampler_index_mode;
7548
7549 tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
7550 tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
7551 tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
7552 tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
7553 tex.src_sel_w = 4;
7554
7555 tex.dst_sel_x = 7;
7556 tex.dst_sel_y = 7;
7557 tex.dst_sel_z = 7;
7558 tex.dst_sel_w = 7;
7559
7560 r = r600_bytecode_add_tex(ctx->bc, &tex);
7561 if (r)
7562 return r;
7563 }
7564
7565 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7566 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7567 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7568 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7569 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
7570 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7571 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7572 switch (opcode) {
7573 case FETCH_OP_SAMPLE:
7574 opcode = FETCH_OP_SAMPLE_C;
7575 break;
7576 case FETCH_OP_SAMPLE_L:
7577 opcode = FETCH_OP_SAMPLE_C_L;
7578 break;
7579 case FETCH_OP_SAMPLE_LB:
7580 opcode = FETCH_OP_SAMPLE_C_LB;
7581 break;
7582 case FETCH_OP_SAMPLE_G:
7583 opcode = FETCH_OP_SAMPLE_C_G;
7584 break;
7585 /* Texture gather variants */
7586 case FETCH_OP_GATHER4:
7587 opcode = FETCH_OP_GATHER4_C;
7588 break;
7589 case FETCH_OP_GATHER4_O:
7590 opcode = FETCH_OP_GATHER4_C_O;
7591 break;
7592 }
7593 }
7594
7595 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7596 tex.op = opcode;
7597
7598 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7599 tex.sampler_index_mode = sampler_index_mode;
7600 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7601 tex.resource_index_mode = sampler_index_mode;
7602 tex.src_gpr = src_gpr;
7603 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7604
7605 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
7606 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
7607 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
7608 }
7609
7610 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7611 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
7612 tex.inst_mod = texture_component_select;
7613
7614 if (ctx->bc->chip_class == CAYMAN) {
7615 /* GATHER4 result order is different from TGSI TG4 */
7616 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
7617 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
7618 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
7619 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7620 } else {
7621 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7622 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7623 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7624 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7625 }
7626 }
7627 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
7628 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7629 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7630 tex.dst_sel_z = 7;
7631 tex.dst_sel_w = 7;
7632 }
7633 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7634 tex.dst_sel_x = 3;
7635 tex.dst_sel_y = 7;
7636 tex.dst_sel_z = 7;
7637 tex.dst_sel_w = 7;
7638 }
7639 else {
7640 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7641 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7642 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7643 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7644 }
7645
7646
7647 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7648 tex.src_sel_x = 4;
7649 tex.src_sel_y = 4;
7650 tex.src_sel_z = 4;
7651 tex.src_sel_w = 4;
7652 } else if (src_loaded) {
7653 tex.src_sel_x = 0;
7654 tex.src_sel_y = 1;
7655 tex.src_sel_z = 2;
7656 tex.src_sel_w = 3;
7657 } else {
7658 tex.src_sel_x = ctx->src[0].swizzle[0];
7659 tex.src_sel_y = ctx->src[0].swizzle[1];
7660 tex.src_sel_z = ctx->src[0].swizzle[2];
7661 tex.src_sel_w = ctx->src[0].swizzle[3];
7662 tex.src_rel = ctx->src[0].rel;
7663 }
7664
7665 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7666 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7667 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7668 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7669 tex.src_sel_x = 1;
7670 tex.src_sel_y = 0;
7671 tex.src_sel_z = 3;
7672 tex.src_sel_w = 2; /* route Z compare or Lod value into W */
7673 }
7674
7675 if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
7676 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
7677 tex.coord_type_x = 1;
7678 tex.coord_type_y = 1;
7679 }
7680 tex.coord_type_z = 1;
7681 tex.coord_type_w = 1;
7682
7683 tex.offset_x = offset_x;
7684 tex.offset_y = offset_y;
7685 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
7686 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7687 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
7688 tex.offset_z = 0;
7689 }
7690 else {
7691 tex.offset_z = offset_z;
7692 }
7693
7694 /* Put the depth for comparison in W.
7695 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
7696 * Some instructions expect the depth in Z. */
7697 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7698 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7699 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7700 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
7701 opcode != FETCH_OP_SAMPLE_C_L &&
7702 opcode != FETCH_OP_SAMPLE_C_LB) {
7703 tex.src_sel_w = tex.src_sel_z;
7704 }
7705
7706 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
7707 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
7708 if (opcode == FETCH_OP_SAMPLE_C_L ||
7709 opcode == FETCH_OP_SAMPLE_C_LB) {
7710 /* the array index is read from Y */
7711 tex.coord_type_y = 0;
7712 } else {
7713 /* the array index is read from Z */
7714 tex.coord_type_z = 0;
7715 tex.src_sel_z = tex.src_sel_y;
7716 }
7717 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7718 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7719 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7720 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7721 (ctx->bc->chip_class >= EVERGREEN)))
7722 /* the array index is read from Z */
7723 tex.coord_type_z = 0;
7724
7725 /* mask unused source components */
7726 if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
7727 switch (inst->Texture.Texture) {
7728 case TGSI_TEXTURE_2D:
7729 case TGSI_TEXTURE_RECT:
7730 tex.src_sel_z = 7;
7731 tex.src_sel_w = 7;
7732 break;
7733 case TGSI_TEXTURE_1D_ARRAY:
7734 tex.src_sel_y = 7;
7735 tex.src_sel_w = 7;
7736 break;
7737 case TGSI_TEXTURE_1D:
7738 tex.src_sel_y = 7;
7739 tex.src_sel_z = 7;
7740 tex.src_sel_w = 7;
7741 break;
7742 }
7743 }
7744
7745 r = r600_bytecode_add_tex(ctx->bc, &tex);
7746 if (r)
7747 return r;
7748
7749 /* add shadow ambient support - gallium doesn't do it yet */
7750 return 0;
7751 }
7752
7753 static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
7754 struct tgsi_full_src_register *src)
7755 {
7756 unsigned i;
7757
7758 if (src->Register.Indirect) {
7759 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
7760 if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
7761 return ctx->shader->atomics[i].hw_idx;
7762 }
7763 } else {
7764 uint32_t index = src->Register.Index;
7765 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
7766 if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
7767 continue;
7768 if (index > ctx->shader->atomics[i].end)
7769 continue;
7770 if (index < ctx->shader->atomics[i].start)
7771 continue;
7772 uint32_t offset = (index - ctx->shader->atomics[i].start);
7773 return ctx->shader->atomics[i].hw_idx + offset;
7774 }
7775 }
7776 assert(0);
7777 return -1;
7778 }
7779
7780
7781 static int tgsi_load_gds(struct r600_shader_ctx *ctx)
7782 {
7783 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7784 int r;
7785 struct r600_bytecode_gds gds;
7786 int uav_id = 0;
7787 int uav_index_mode = 0;
7788
7789 uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
7790
7791 if (inst->Src[0].Register.Indirect)
7792 uav_index_mode = 2;
7793
7794 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
7795 gds.op = FETCH_OP_GDS_READ_RET;
7796 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7797 gds.uav_id = uav_id;
7798 gds.uav_index_mode = uav_index_mode;
7799 gds.src_gpr = ctx->temp_reg;
7800 gds.src_sel_x = 4;
7801 gds.src_sel_y = 4;
7802 gds.src_sel_z = 4;
7803 gds.dst_sel_x = 0;
7804 gds.dst_sel_y = 7;
7805 gds.dst_sel_z = 7;
7806 gds.dst_sel_w = 7;
7807 gds.src_gpr2 = ctx->temp_reg;
7808 gds.alloc_consume = 1;
7809 r = r600_bytecode_add_gds(ctx->bc, &gds);
7810 if (r)
7811 return r;
7812
7813 ctx->bc->cf_last->vpm = 1;
7814 return 0;
7815 }
7816
7817 /* this fixes up 1D arrays properly */
7818 static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
7819 {
7820 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7821 int r, i;
7822 struct r600_bytecode_alu alu;
7823 int temp_reg = r600_get_temp(ctx);
7824
7825 for (i = 0; i < 4; i++) {
7826 bool def_val = true, write_zero = false;
7827 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7828 alu.op = ALU_OP1_MOV;
7829 alu.dst.sel = temp_reg;
7830 alu.dst.chan = i;
7831
7832 switch (inst->Memory.Texture) {
7833 case TGSI_TEXTURE_BUFFER:
7834 case TGSI_TEXTURE_1D:
7835 if (i == 1 || i == 2 || i == 3) {
7836 write_zero = true;
7837 }
7838 break;
7839 case TGSI_TEXTURE_1D_ARRAY:
7840 if (i == 1 || i == 3)
7841 write_zero = true;
7842 else if (i == 2) {
7843 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
7844 def_val = false;
7845 }
7846 break;
7847 case TGSI_TEXTURE_2D:
7848 if (i == 2 || i == 3)
7849 write_zero = true;
7850 break;
7851 default:
7852 if (i == 3)
7853 write_zero = true;
7854 break;
7855 }
7856
7857 if (write_zero) {
7858 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
7859 alu.src[0].value = 0;
7860 } else if (def_val) {
7861 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
7862 }
7863
7864 if (i == 3)
7865 alu.last = 1;
7866 alu.dst.write = 1;
7867 r = r600_bytecode_add_alu(ctx->bc, &alu);
7868 if (r)
7869 return r;
7870 }
7871 *idx_gpr = temp_reg;
7872 return 0;
7873 }
7874
7875 static int tgsi_load_rat(struct r600_shader_ctx *ctx)
7876 {
7877 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7878 /* have to work out the offset into the RAT immediate return buffer */
7879 struct r600_bytecode_vtx vtx;
7880 struct r600_bytecode_cf *cf;
7881 int r;
7882 int idx_gpr;
7883 unsigned format, num_format, format_comp, endian;
7884 const struct util_format_description *desc;
7885 unsigned rat_index_mode;
7886 unsigned immed_base;
7887
7888 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7889
7890 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
7891 r = load_index_src(ctx, 1, &idx_gpr);
7892 if (r)
7893 return r;
7894
7895 if (rat_index_mode)
7896 egcm_load_index_reg(ctx->bc, 1, false);
7897
7898 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
7899 cf = ctx->bc->cf_last;
7900
7901 cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
7902 cf->rat.inst = V_RAT_INST_NOP_RTN;
7903 cf->rat.index_mode = rat_index_mode;
7904 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
7905 cf->output.gpr = ctx->thread_id_gpr;
7906 cf->output.index_gpr = idx_gpr;
7907 cf->output.comp_mask = 0xf;
7908 cf->output.burst_count = 1;
7909 cf->vpm = 1;
7910 cf->barrier = 1;
7911 cf->mark = 1;
7912 cf->output.elem_size = 0;
7913
7914 r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
7915 cf = ctx->bc->cf_last;
7916 cf->barrier = 1;
7917
7918 desc = util_format_description(inst->Memory.Format);
7919 r600_vertex_data_type(inst->Memory.Format,
7920 &format, &num_format, &format_comp, &endian);
7921 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
7922 vtx.op = FETCH_OP_VFETCH;
7923 vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
7924 vtx.buffer_index_mode = rat_index_mode;
7925 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
7926 vtx.src_gpr = ctx->thread_id_gpr;
7927 vtx.src_sel_x = 1;
7928 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7929 vtx.dst_sel_x = desc->swizzle[0];
7930 vtx.dst_sel_y = desc->swizzle[1];
7931 vtx.dst_sel_z = desc->swizzle[2];
7932 vtx.dst_sel_w = desc->swizzle[3];
7933 vtx.srf_mode_all = 1;
7934 vtx.data_format = format;
7935 vtx.num_format_all = num_format;
7936 vtx.format_comp_all = format_comp;
7937 vtx.endian = endian;
7938 vtx.offset = 0;
7939 vtx.mega_fetch_count = 3;
7940 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
7941 if (r)
7942 return r;
7943 cf = ctx->bc->cf_last;
7944 cf->barrier = 1;
7945 return 0;
7946 }
7947
7948 static int tgsi_load(struct r600_shader_ctx *ctx)
7949 {
7950 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7951 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
7952 return tgsi_load_rat(ctx);
7953 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
7954 return tgsi_load_gds(ctx);
7955 return 0;
7956 }
7957
7958 static int tgsi_store_rat(struct r600_shader_ctx *ctx)
7959 {
7960 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7961 struct r600_bytecode_cf *cf;
7962 bool src_requires_loading = false;
7963 int val_gpr, idx_gpr;
7964 int r, i;
7965 unsigned rat_index_mode;
7966
7967 rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7968
7969 r = load_index_src(ctx, 0, &idx_gpr);
7970 if (r)
7971 return r;
7972
7973 if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
7974 src_requires_loading = true;
7975
7976 if (src_requires_loading) {
7977 struct r600_bytecode_alu alu;
7978 for (i = 0; i < 4; i++) {
7979 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7980 alu.op = ALU_OP1_MOV;
7981 alu.dst.sel = ctx->temp_reg;
7982 alu.dst.chan = i;
7983
7984 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
7985 if (i == 3)
7986 alu.last = 1;
7987 alu.dst.write = 1;
7988 r = r600_bytecode_add_alu(ctx->bc, &alu);
7989 if (r)
7990 return r;
7991 }
7992 val_gpr = ctx->temp_reg;
7993 } else
7994 val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
7995 if (rat_index_mode)
7996 egcm_load_index_reg(ctx->bc, 1, false);
7997
7998 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
7999 cf = ctx->bc->cf_last;
8000
8001 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
8002 cf->rat.inst = V_RAT_INST_STORE_TYPED;
8003 cf->rat.index_mode = rat_index_mode;
8004 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
8005 cf->output.gpr = val_gpr;
8006 cf->output.index_gpr = idx_gpr;
8007 cf->output.comp_mask = 0xf;
8008 cf->output.burst_count = 1;
8009 cf->vpm = 1;
8010 cf->barrier = 1;
8011 cf->output.elem_size = 0;
8012 return 0;
8013 }
8014
/* TGSI STORE entry point: only image (RAT) stores are handled here. */
static int tgsi_store(struct r600_shader_ctx *ctx)
{
	return tgsi_store_rat(ctx);
}
8019
/* Emit a TGSI image atomic as a MEM_RAT CF instruction, then fetch the
 * returned (pre-op) value back from the RAT immediate return buffer
 * with a vertex fetch. The atomic operands are staged in thread_id_gpr
 * before the CF instruction is issued. */
static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode;
	unsigned immed_base;

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;

	assert (inst->Src[0].Register.File == TGSI_FILE_IMAGE);
	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	r = load_index_src(ctx, 1, &idx_gpr);
	if (r)
		return r;

	if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
		/* CMPXCHG takes two operands: the swap value (Src[3]) is
		 * staged in thread_id_gpr.x ... */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* ... and the compare value (Src[2]) in thread_id_gpr.w. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		/* single-operand atomics: stage the value in thread_id_gpr.x */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	/* the atomic RAT op returns its result through the immediate
	 * return buffer, indexed by the per-thread id */
	cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = ctx->inst_info->op;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;
	/* wait for the atomic's return data before fetching it */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	cf->cf_addr = 1;

	desc = util_format_description(inst->Memory.Format);
	r600_vertex_data_type(inst->Memory.Format,
			      &format, &num_format, &format_comp, &endian);
	/* fetch the returned scalar (x only) into the destination GPR */
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = desc->swizzle[0];
	vtx.dst_sel_y = 7;
	vtx.dst_sel_z = 7;
	vtx.dst_sel_w = 7;
	vtx.use_const_fields = 0;
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 0xf;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->vpm = 1;
	cf->barrier = 1;
	return 0;
}
8131
8132 static int get_gds_op(int opcode)
8133 {
8134 switch (opcode) {
8135 case TGSI_OPCODE_ATOMUADD:
8136 return FETCH_OP_GDS_ADD_RET;
8137 case TGSI_OPCODE_ATOMAND:
8138 return FETCH_OP_GDS_AND_RET;
8139 case TGSI_OPCODE_ATOMOR:
8140 return FETCH_OP_GDS_OR_RET;
8141 case TGSI_OPCODE_ATOMXOR:
8142 return FETCH_OP_GDS_XOR_RET;
8143 case TGSI_OPCODE_ATOMUMIN:
8144 return FETCH_OP_GDS_MIN_UINT_RET;
8145 case TGSI_OPCODE_ATOMUMAX:
8146 return FETCH_OP_GDS_MAX_UINT_RET;
8147 case TGSI_OPCODE_ATOMXCHG:
8148 return FETCH_OP_GDS_XCHG_RET;
8149 case TGSI_OPCODE_ATOMCAS:
8150 return FETCH_OP_GDS_CMP_XCHG_RET;
8151 default:
8152 return -1;
8153 }
8154 }
8155
8156 static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
8157 {
8158 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8159 struct r600_bytecode_gds gds;
8160 struct r600_bytecode_alu alu;
8161 int gds_op = get_gds_op(inst->Instruction.Opcode);
8162 int r;
8163 int uav_id = 0;
8164 int uav_index_mode = 0;
8165
8166 if (gds_op == -1) {
8167 fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
8168 return -1;
8169 }
8170
8171 uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
8172
8173 if (inst->Src[0].Register.Indirect)
8174 uav_index_mode = 2;
8175
8176 if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
8177 int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
8178 int abs_value = abs(value);
8179 if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
8180 gds_op = FETCH_OP_GDS_SUB_RET;
8181 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8182 alu.op = ALU_OP1_MOV;
8183 alu.dst.sel = ctx->temp_reg;
8184 alu.dst.chan = 0;
8185 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
8186 alu.src[0].value = abs_value;
8187 alu.last = 1;
8188 alu.dst.write = 1;
8189 r = r600_bytecode_add_alu(ctx->bc, &alu);
8190 if (r)
8191 return r;
8192 } else {
8193 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8194 alu.op = ALU_OP1_MOV;
8195 alu.dst.sel = ctx->temp_reg;
8196 alu.dst.chan = 0;
8197 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
8198 alu.last = 1;
8199 alu.dst.write = 1;
8200 r = r600_bytecode_add_alu(ctx->bc, &alu);
8201 if (r)
8202 return r;
8203 }
8204
8205 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
8206 gds.op = gds_op;
8207 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8208 gds.uav_id = uav_id;
8209 gds.uav_index_mode = uav_index_mode;
8210 gds.src_gpr = ctx->temp_reg;
8211 gds.src_gpr2 = ctx->temp_reg;
8212 gds.src_sel_x = 4;
8213 gds.src_sel_y = 0;
8214 gds.src_sel_z = 4;
8215 gds.dst_sel_x = 0;
8216 gds.dst_sel_y = 7;
8217 gds.dst_sel_z = 7;
8218 gds.dst_sel_w = 7;
8219 gds.alloc_consume = 1;
8220 r = r600_bytecode_add_gds(ctx->bc, &gds);
8221 if (r)
8222 return r;
8223 ctx->bc->cf_last->vpm = 1;
8224 return 0;
8225 }
8226
8227 static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
8228 {
8229 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8230 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
8231 return tgsi_atomic_op_rat(ctx);
8232 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
8233 return tgsi_atomic_op_gds(ctx);
8234 return 0;
8235 }
8236
8237 static int tgsi_resq(struct r600_shader_ctx *ctx)
8238 {
8239 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8240 unsigned sampler_index_mode;
8241 struct r600_bytecode_tex tex;
8242 int r;
8243 boolean has_txq_cube_array_z = false;
8244
8245 if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
8246 ctx->shader->uses_tex_buffers = true;
8247 return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset);
8248 }
8249
8250 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
8251 inst->Dst[0].Register.WriteMask & 4) {
8252 ctx->shader->has_txq_cube_array_z_comp = true;
8253 has_txq_cube_array_z = true;
8254 }
8255
8256 sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8257 if (sampler_index_mode)
8258 egcm_load_index_reg(ctx->bc, 1, false);
8259
8260
8261 /* does this shader want a num layers from TXQ for a cube array? */
8262 if (has_txq_cube_array_z) {
8263 int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
8264 struct r600_bytecode_alu alu;
8265
8266 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8267 alu.op = ALU_OP1_MOV;
8268
8269 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
8270 /* channel 1 or 3 of each word */
8271 alu.src[0].sel += (id / 2);
8272 alu.src[0].chan = ((id % 2) * 2) + 1;
8273 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
8274 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
8275 alu.last = 1;
8276 r = r600_bytecode_add_alu(ctx->bc, &alu);
8277 if (r)
8278 return r;
8279 /* disable writemask from texture instruction */
8280 inst->Dst[0].Register.WriteMask &= ~4;
8281 }
8282 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8283 tex.op = ctx->inst_info->op;
8284 tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
8285 tex.sampler_index_mode = sampler_index_mode;
8286 tex.resource_id = tex.sampler_id;
8287 tex.resource_index_mode = sampler_index_mode;
8288 tex.src_sel_x = 4;
8289 tex.src_sel_y = 4;
8290 tex.src_sel_z = 4;
8291 tex.src_sel_w = 4;
8292 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8293 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8294 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8295 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8296 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8297 r = r600_bytecode_add_tex(ctx->bc, &tex);
8298 if (r)
8299 return r;
8300
8301 return 0;
8302 }
8303
8304 static int tgsi_lrp(struct r600_shader_ctx *ctx)
8305 {
8306 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8307 struct r600_bytecode_alu alu;
8308 unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8309 unsigned i, temp_regs[2];
8310 int r;
8311
8312 /* optimize if it's just an equal balance */
8313 if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
8314 for (i = 0; i < lasti + 1; i++) {
8315 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8316 continue;
8317
8318 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8319 alu.op = ALU_OP2_ADD;
8320 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
8321 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
8322 alu.omod = 3;
8323 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8324 alu.dst.chan = i;
8325 if (i == lasti) {
8326 alu.last = 1;
8327 }
8328 r = r600_bytecode_add_alu(ctx->bc, &alu);
8329 if (r)
8330 return r;
8331 }
8332 return 0;
8333 }
8334
8335 /* 1 - src0 */
8336 for (i = 0; i < lasti + 1; i++) {
8337 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8338 continue;
8339
8340 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8341 alu.op = ALU_OP2_ADD;
8342 alu.src[0].sel = V_SQ_ALU_SRC_1;
8343 alu.src[0].chan = 0;
8344 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
8345 r600_bytecode_src_toggle_neg(&alu.src[1]);
8346 alu.dst.sel = ctx->temp_reg;
8347 alu.dst.chan = i;
8348 if (i == lasti) {
8349 alu.last = 1;
8350 }
8351 alu.dst.write = 1;
8352 r = r600_bytecode_add_alu(ctx->bc, &alu);
8353 if (r)
8354 return r;
8355 }
8356
8357 /* (1 - src0) * src2 */
8358 for (i = 0; i < lasti + 1; i++) {
8359 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8360 continue;
8361
8362 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8363 alu.op = ALU_OP2_MUL;
8364 alu.src[0].sel = ctx->temp_reg;
8365 alu.src[0].chan = i;
8366 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
8367 alu.dst.sel = ctx->temp_reg;
8368 alu.dst.chan = i;
8369 if (i == lasti) {
8370 alu.last = 1;
8371 }
8372 alu.dst.write = 1;
8373 r = r600_bytecode_add_alu(ctx->bc, &alu);
8374 if (r)
8375 return r;
8376 }
8377
8378 /* src0 * src1 + (1 - src0) * src2 */
8379 if (ctx->src[0].abs)
8380 temp_regs[0] = r600_get_temp(ctx);
8381 else
8382 temp_regs[0] = 0;
8383 if (ctx->src[1].abs)
8384 temp_regs[1] = r600_get_temp(ctx);
8385 else
8386 temp_regs[1] = 0;
8387
8388 for (i = 0; i < lasti + 1; i++) {
8389 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8390 continue;
8391
8392 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8393 alu.op = ALU_OP3_MULADD;
8394 alu.is_op3 = 1;
8395 r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
8396 if (r)
8397 return r;
8398 r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
8399 if (r)
8400 return r;
8401 alu.src[2].sel = ctx->temp_reg;
8402 alu.src[2].chan = i;
8403
8404 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8405 alu.dst.chan = i;
8406 if (i == lasti) {
8407 alu.last = 1;
8408 }
8409 r = r600_bytecode_add_alu(ctx->bc, &alu);
8410 if (r)
8411 return r;
8412 }
8413 return 0;
8414 }
8415
8416 static int tgsi_cmp(struct r600_shader_ctx *ctx)
8417 {
8418 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8419 struct r600_bytecode_alu alu;
8420 int i, r, j;
8421 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8422 int temp_regs[3];
8423 unsigned op;
8424
8425 if (ctx->src[0].abs && ctx->src[0].neg) {
8426 op = ALU_OP3_CNDE;
8427 ctx->src[0].abs = 0;
8428 ctx->src[0].neg = 0;
8429 } else {
8430 op = ALU_OP3_CNDGE;
8431 }
8432
8433 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
8434 temp_regs[j] = 0;
8435 if (ctx->src[j].abs)
8436 temp_regs[j] = r600_get_temp(ctx);
8437 }
8438
8439 for (i = 0; i < lasti + 1; i++) {
8440 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8441 continue;
8442
8443 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8444 alu.op = op;
8445 r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
8446 if (r)
8447 return r;
8448 r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
8449 if (r)
8450 return r;
8451 r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
8452 if (r)
8453 return r;
8454 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8455 alu.dst.chan = i;
8456 alu.dst.write = 1;
8457 alu.is_op3 = 1;
8458 if (i == lasti)
8459 alu.last = 1;
8460 r = r600_bytecode_add_alu(ctx->bc, &alu);
8461 if (r)
8462 return r;
8463 }
8464 return 0;
8465 }
8466
8467 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
8468 {
8469 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8470 struct r600_bytecode_alu alu;
8471 int i, r;
8472 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8473
8474 for (i = 0; i < lasti + 1; i++) {
8475 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8476 continue;
8477
8478 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8479 alu.op = ALU_OP3_CNDE_INT;
8480 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8481 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
8482 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
8483 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8484 alu.dst.chan = i;
8485 alu.dst.write = 1;
8486 alu.is_op3 = 1;
8487 if (i == lasti)
8488 alu.last = 1;
8489 r = r600_bytecode_add_alu(ctx->bc, &alu);
8490 if (r)
8491 return r;
8492 }
8493 return 0;
8494 }
8495
/* EXP: dst.x = 2^floor(src.x), dst.y = src.x - floor(src.x) (FRACT),
 *      dst.z = 2^src.x (rough approximation), dst.w = 1.0.
 * Each component is only emitted when enabled in the write mask; results are
 * staged in temp_reg and moved to the real destination by tgsi_helper_copy().
 * On CAYMAN the t-slot-only EXP_IEEE is replicated across vector slots (see
 * the CAYMAN notes at the top of this file). */
static int tgsi_exp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	unsigned i;

	/* result.x = 2^floor(src); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* NOTE(review): alu is deliberately reused (no memset) for the
		 * EXP below; sel/chan/dst are overwritten, but any src modifier
		 * flags set by r600_bytecode_src() above would carry over —
		 * confirm sources with modifiers are handled upstream. */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = i == 0;
				alu.last = i == 2;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.y = tmp - floor(tmp); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FRACT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
#if 0
		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (r)
			return r;
#endif
		alu.dst.write = 1;
		alu.dst.chan = 1;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = RoughApprox2ToX(tmp);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				/* only the replica landing on channel 2 is kept */
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				}

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;

			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0;*/
	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return tgsi_helper_copy(ctx, inst);
}
8624
8625 static int tgsi_log(struct r600_shader_ctx *ctx)
8626 {
8627 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8628 struct r600_bytecode_alu alu;
8629 int r;
8630 unsigned i;
8631
8632 /* result.x = floor(log2(|src|)); */
8633 if (inst->Dst[0].Register.WriteMask & 1) {
8634 if (ctx->bc->chip_class == CAYMAN) {
8635 for (i = 0; i < 3; i++) {
8636 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8637
8638 alu.op = ALU_OP1_LOG_IEEE;
8639 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8640 r600_bytecode_src_set_abs(&alu.src[0]);
8641
8642 alu.dst.sel = ctx->temp_reg;
8643 alu.dst.chan = i;
8644 if (i == 0)
8645 alu.dst.write = 1;
8646 if (i == 2)
8647 alu.last = 1;
8648 r = r600_bytecode_add_alu(ctx->bc, &alu);
8649 if (r)
8650 return r;
8651 }
8652
8653 } else {
8654 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8655
8656 alu.op = ALU_OP1_LOG_IEEE;
8657 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8658 r600_bytecode_src_set_abs(&alu.src[0]);
8659
8660 alu.dst.sel = ctx->temp_reg;
8661 alu.dst.chan = 0;
8662 alu.dst.write = 1;
8663 alu.last = 1;
8664 r = r600_bytecode_add_alu(ctx->bc, &alu);
8665 if (r)
8666 return r;
8667 }
8668
8669 alu.op = ALU_OP1_FLOOR;
8670 alu.src[0].sel = ctx->temp_reg;
8671 alu.src[0].chan = 0;
8672
8673 alu.dst.sel = ctx->temp_reg;
8674 alu.dst.chan = 0;
8675 alu.dst.write = 1;
8676 alu.last = 1;
8677
8678 r = r600_bytecode_add_alu(ctx->bc, &alu);
8679 if (r)
8680 return r;
8681 }
8682
8683 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
8684 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
8685
8686 if (ctx->bc->chip_class == CAYMAN) {
8687 for (i = 0; i < 3; i++) {
8688 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8689
8690 alu.op = ALU_OP1_LOG_IEEE;
8691 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8692 r600_bytecode_src_set_abs(&alu.src[0]);
8693
8694 alu.dst.sel = ctx->temp_reg;
8695 alu.dst.chan = i;
8696 if (i == 1)
8697 alu.dst.write = 1;
8698 if (i == 2)
8699 alu.last = 1;
8700
8701 r = r600_bytecode_add_alu(ctx->bc, &alu);
8702 if (r)
8703 return r;
8704 }
8705 } else {
8706 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8707
8708 alu.op = ALU_OP1_LOG_IEEE;
8709 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8710 r600_bytecode_src_set_abs(&alu.src[0]);
8711
8712 alu.dst.sel = ctx->temp_reg;
8713 alu.dst.chan = 1;
8714 alu.dst.write = 1;
8715 alu.last = 1;
8716
8717 r = r600_bytecode_add_alu(ctx->bc, &alu);
8718 if (r)
8719 return r;
8720 }
8721
8722 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8723
8724 alu.op = ALU_OP1_FLOOR;
8725 alu.src[0].sel = ctx->temp_reg;
8726 alu.src[0].chan = 1;
8727
8728 alu.dst.sel = ctx->temp_reg;
8729 alu.dst.chan = 1;
8730 alu.dst.write = 1;
8731 alu.last = 1;
8732
8733 r = r600_bytecode_add_alu(ctx->bc, &alu);
8734 if (r)
8735 return r;
8736
8737 if (ctx->bc->chip_class == CAYMAN) {
8738 for (i = 0; i < 3; i++) {
8739 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8740 alu.op = ALU_OP1_EXP_IEEE;
8741 alu.src[0].sel = ctx->temp_reg;
8742 alu.src[0].chan = 1;
8743
8744 alu.dst.sel = ctx->temp_reg;
8745 alu.dst.chan = i;
8746 if (i == 1)
8747 alu.dst.write = 1;
8748 if (i == 2)
8749 alu.last = 1;
8750
8751 r = r600_bytecode_add_alu(ctx->bc, &alu);
8752 if (r)
8753 return r;
8754 }
8755 } else {
8756 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8757 alu.op = ALU_OP1_EXP_IEEE;
8758 alu.src[0].sel = ctx->temp_reg;
8759 alu.src[0].chan = 1;
8760
8761 alu.dst.sel = ctx->temp_reg;
8762 alu.dst.chan = 1;
8763 alu.dst.write = 1;
8764 alu.last = 1;
8765
8766 r = r600_bytecode_add_alu(ctx->bc, &alu);
8767 if (r)
8768 return r;
8769 }
8770
8771 if (ctx->bc->chip_class == CAYMAN) {
8772 for (i = 0; i < 3; i++) {
8773 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8774 alu.op = ALU_OP1_RECIP_IEEE;
8775 alu.src[0].sel = ctx->temp_reg;
8776 alu.src[0].chan = 1;
8777
8778 alu.dst.sel = ctx->temp_reg;
8779 alu.dst.chan = i;
8780 if (i == 1)
8781 alu.dst.write = 1;
8782 if (i == 2)
8783 alu.last = 1;
8784
8785 r = r600_bytecode_add_alu(ctx->bc, &alu);
8786 if (r)
8787 return r;
8788 }
8789 } else {
8790 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8791 alu.op = ALU_OP1_RECIP_IEEE;
8792 alu.src[0].sel = ctx->temp_reg;
8793 alu.src[0].chan = 1;
8794
8795 alu.dst.sel = ctx->temp_reg;
8796 alu.dst.chan = 1;
8797 alu.dst.write = 1;
8798 alu.last = 1;
8799
8800 r = r600_bytecode_add_alu(ctx->bc, &alu);
8801 if (r)
8802 return r;
8803 }
8804
8805 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8806
8807 alu.op = ALU_OP2_MUL;
8808
8809 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8810 r600_bytecode_src_set_abs(&alu.src[0]);
8811
8812 alu.src[1].sel = ctx->temp_reg;
8813 alu.src[1].chan = 1;
8814
8815 alu.dst.sel = ctx->temp_reg;
8816 alu.dst.chan = 1;
8817 alu.dst.write = 1;
8818 alu.last = 1;
8819
8820 r = r600_bytecode_add_alu(ctx->bc, &alu);
8821 if (r)
8822 return r;
8823 }
8824
8825 /* result.z = log2(|src|);*/
8826 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
8827 if (ctx->bc->chip_class == CAYMAN) {
8828 for (i = 0; i < 3; i++) {
8829 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8830
8831 alu.op = ALU_OP1_LOG_IEEE;
8832 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8833 r600_bytecode_src_set_abs(&alu.src[0]);
8834
8835 alu.dst.sel = ctx->temp_reg;
8836 if (i == 2)
8837 alu.dst.write = 1;
8838 alu.dst.chan = i;
8839 if (i == 2)
8840 alu.last = 1;
8841
8842 r = r600_bytecode_add_alu(ctx->bc, &alu);
8843 if (r)
8844 return r;
8845 }
8846 } else {
8847 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8848
8849 alu.op = ALU_OP1_LOG_IEEE;
8850 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8851 r600_bytecode_src_set_abs(&alu.src[0]);
8852
8853 alu.dst.sel = ctx->temp_reg;
8854 alu.dst.write = 1;
8855 alu.dst.chan = 2;
8856 alu.last = 1;
8857
8858 r = r600_bytecode_add_alu(ctx->bc, &alu);
8859 if (r)
8860 return r;
8861 }
8862 }
8863
8864 /* result.w = 1.0; */
8865 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
8866 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8867
8868 alu.op = ALU_OP1_MOV;
8869 alu.src[0].sel = V_SQ_ALU_SRC_1;
8870 alu.src[0].chan = 0;
8871
8872 alu.dst.sel = ctx->temp_reg;
8873 alu.dst.chan = 3;
8874 alu.dst.write = 1;
8875 alu.last = 1;
8876
8877 r = r600_bytecode_add_alu(ctx->bc, &alu);
8878 if (r)
8879 return r;
8880 }
8881
8882 return tgsi_helper_copy(ctx, inst);
8883 }
8884
/* ARL/ARR/UARL on evergreen+: convert the source to an integer (floor for
 * ARL, round-to-nearest for ARR, plain move for UARL) and write it to the
 * backing register for AR (Dst index 0) or one of the extra index registers
 * (Dst index 1/2), as chosen by get_address_file_reg().  The hardware
 * address/index register itself is loaded lazily, so it is flagged stale
 * here and reloaded on next use. */
static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);

	assert(inst->Dst[0].Register.Index < 3);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
		break;
	case TGSI_OPCODE_ARR:
		alu.op = ALU_OP1_FLT_TO_INT;
		break;
	case TGSI_OPCODE_UARL:
		alu.op = ALU_OP1_MOV;
		break;
	default:
		assert(0);
		return -1;
	}

	/* alu is reused across iterations; every field that matters is
	 * rewritten each time round */
	for (i = 0; i <= lasti; ++i) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.last = i == lasti;
		alu.dst.sel = reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* mark the lazily-loaded AR / index register as needing a reload */
	if (inst->Dst[0].Register.Index > 0)
		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
	else
		ctx->bc->ar_loaded = 0;

	return 0;
}
/* ARL/ARR/UARL on r600/r700: stage the integer address value in the ar_reg
 * backing register (AR itself is loaded lazily).  ARL needs a separate
 * FLOOR + FLT_TO_INT sequence because there is no FLT_TO_INT_FLOOR on
 * these chips; FLT_TO_INT is a trans-unit-only op here, so each such ALU
 * must terminate its group (alu.last). */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* ar_reg = floor(src) */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		/* ar_reg = (int)ar_reg — note: all channels up to lasti are
		 * converted, regardless of write mask */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		/* ar_reg = round(src) via FLT_TO_INT's round-to-nearest */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		/* ar_reg = src (already an integer) */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* force AR to be reloaded from ar_reg on next use */
	ctx->bc->ar_loaded = 0;
	return 0;
}
9007
9008 static int tgsi_opdst(struct r600_shader_ctx *ctx)
9009 {
9010 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9011 struct r600_bytecode_alu alu;
9012 int i, r = 0;
9013
9014 for (i = 0; i < 4; i++) {
9015 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9016
9017 alu.op = ALU_OP2_MUL;
9018 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9019
9020 if (i == 0 || i == 3) {
9021 alu.src[0].sel = V_SQ_ALU_SRC_1;
9022 } else {
9023 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9024 }
9025
9026 if (i == 0 || i == 2) {
9027 alu.src[1].sel = V_SQ_ALU_SRC_1;
9028 } else {
9029 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
9030 }
9031 if (i == 3)
9032 alu.last = 1;
9033 r = r600_bytecode_add_alu(ctx->bc, &alu);
9034 if (r)
9035 return r;
9036 }
9037 return 0;
9038 }
9039
9040 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
9041 {
9042 struct r600_bytecode_alu alu;
9043 int r;
9044
9045 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9046 alu.op = opcode;
9047 alu.execute_mask = 1;
9048 alu.update_pred = 1;
9049
9050 alu.dst.sel = ctx->temp_reg;
9051 alu.dst.write = 1;
9052 alu.dst.chan = 0;
9053
9054 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9055 alu.src[1].sel = V_SQ_ALU_SRC_0;
9056 alu.src[1].chan = 0;
9057
9058 alu.last = 1;
9059
9060 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
9061 if (r)
9062 return r;
9063 return 0;
9064 }
9065
/* Emit `pops` stack pops.  If the previous CF instruction is an ALU clause
 * that can absorb them (ALU -> ALU_POP_AFTER for one pop, or
 * ALU_POP_AFTER -> ALU_POP2_AFTER for a second), fold the pop into that
 * clause; otherwise emit an explicit POP CF instruction. */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop encodes how many pops the last clause already carries;
		 * 3 means "cannot fold" (not an ALU clause) */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			/* more than two pops can't ride on an ALU clause */
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		/* POP continues at the instruction right after itself */
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
9098
/* Recompute the worst-case hardware branch-stack depth (reported to the HW
 * as STACK_SIZE) after a push of the given kind; the chip families differ
 * in how many extra stack elements various operations consume. */
static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
						unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements;
	int entries;

	unsigned entry_size = stack->entry_size;

	/* loop and WQM frames take a full entry each; plain pushes one element */
	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	/* round element count up to whole hardware entries */
	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
}
9163
9164 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
9165 {
9166 switch(reason) {
9167 case FC_PUSH_VPM:
9168 --ctx->bc->stack.push;
9169 assert(ctx->bc->stack.push >= 0);
9170 break;
9171 case FC_PUSH_WQM:
9172 --ctx->bc->stack.push_wqm;
9173 assert(ctx->bc->stack.push_wqm >= 0);
9174 break;
9175 case FC_LOOP:
9176 --ctx->bc->stack.loop;
9177 assert(ctx->bc->stack.loop >= 0);
9178 break;
9179 default:
9180 assert(0);
9181 break;
9182 }
9183 }
9184
/* Account for a new branch-stack frame of the given kind and update the
 * shader's required stack depth. */
static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
{
	switch (reason) {
	case FC_PUSH_VPM:
		++ctx->bc->stack.push;
		break;
	case FC_PUSH_WQM:
		++ctx->bc->stack.push_wqm;
		/* NOTE(review): no break here, so a WQM push also increments the
		 * loop counter below.  This looks intentional (a WQM push is
		 * sized like a loop frame in callstack_update_max_depth), but
		 * confirm against the stack-depth accounting. */
	case FC_LOOP:
		++ctx->bc->stack.loop;
		break;
	default:
		assert(0);
	}

	callstack_update_max_depth(ctx, reason);
}
9202
/* Append the current last CF instruction to the "mid" list of the
 * flow-control stack entry at fc_sp.  "mid" records ELSE and loop
 * BREAK/CONTINUE instructions whose branch targets are patched later by
 * tgsi_endif/tgsi_endloop.
 * NOTE(review): the realloc result is assigned straight back and never
 * checked; on OOM the old array leaks and the store below dereferences
 * NULL.  The driver appears to follow a crash-on-OOM convention here —
 * confirm before "fixing". */
static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
{
	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];

	sp->mid = realloc((void *)sp->mid,
			sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
	sp->mid[sp->num_mid] = ctx->bc->cf_last;
	sp->num_mid++;
}
9212
9213 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
9214 {
9215 assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
9216 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
9217 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
9218 ctx->bc->fc_sp++;
9219 }
9220
9221 static void fc_poplevel(struct r600_shader_ctx *ctx)
9222 {
9223 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
9224 free(sp->mid);
9225 sp->mid = NULL;
9226 sp->num_mid = 0;
9227 sp->start = NULL;
9228 sp->type = 0;
9229 ctx->bc->fc_sp--;
9230 }
9231
/* NOTE(review): dead, never-compiled scaffolding for subroutine RET/CAL
 * lowering (the stray ')' typos in the add_cfinst calls would not even
 * build).  Kept under #if 0 for reference only. */
#if 0
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
9279
/* Open an IF block: emit the predicate-setting ALU clause and a conditional
 * JUMP whose target is patched later by tgsi_else/tgsi_endif, then push the
 * flow-control and branch-stack state. */
static int emit_if(struct r600_shader_ctx *ctx, int opcode)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	fc_pushlevel(ctx, FC_IF);

	callstack_push(ctx, FC_PUSH_VPM);
	return 0;
}
9303
/* IF: float compare — take the branch when src0.x != 0.0f. */
static int tgsi_if(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE);
}
9308
/* UIF: integer compare — take the branch when src0.x != 0. */
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
}
9313
/* ELSE: emit the ELSE CF, record it in the enclosing IF's "mid" list so
 * tgsi_endif can patch its target, and point the IF's JUMP at it. */
static int tgsi_else(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
	ctx->bc->cf_last->pop_count = 1;

	/* patch the pending IF JUMP to land on this ELSE */
	fc_set_mid(ctx, ctx->bc->fc_sp - 1);
	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
	return 0;
}
9323
/* ENDIF: pop the branch stack, then patch the pending JUMP (or the ELSE, if
 * one was recorded) to point just past the block. */
static int tgsi_endif(struct r600_shader_ctx *ctx)
{
	pops(ctx, 1);
	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
		R600_ERR("if/endif unbalanced in shader\n");
		return -1;
	}

	/* without an ELSE the IF's JUMP skips past the ENDIF; with one, the
	 * JUMP already targets the ELSE, so patch the ELSE's target instead */
	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
	} else {
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
	}
	fc_poplevel(ctx);

	callstack_pop(ctx, FC_PUSH_VPM);
	return 0;
}
9343
/* BGNLOOP: open a loop; branch targets are patched in tgsi_endloop. */
static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
{
	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
	 * limited to 4096 iterations, like the other LOOP_* instructions. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);

	fc_pushlevel(ctx, FC_LOOP);

	/* check stack depth */
	callstack_push(ctx, FC_LOOP);
	return 0;
}
9356
/* ENDLOOP: emit LOOP_END and patch all the loop's branch targets (the
 * matching LOOP_START and any recorded BREAK/CONTINUE CFs). */
static int tgsi_endloop(struct r600_shader_ctx *ctx)
{
	int i;

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);

	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
		R600_ERR("loop/endloop in shader code are not paired.\n");
		return -EINVAL;
	}

	/* fixup loop pointers - from r600isa
	   LOOP END points to CF after LOOP START,
	   LOOP START point to CF after LOOP END
	   BRK/CONT point to LOOP END CF
	*/
	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;

	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;

	/* retarget every BREAK/CONTINUE recorded for this loop at the LOOP_END */
	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
	}
	/* XXX add LOOPRET support */
	fc_poplevel(ctx);
	callstack_pop(ctx, FC_LOOP);
	return 0;
}
9385
9386 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
9387 {
9388 unsigned int fscp;
9389
9390 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
9391 {
9392 if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
9393 break;
9394 }
9395
9396 if (fscp == 0) {
9397 R600_ERR("Break not inside loop/endloop pair\n");
9398 return -EINVAL;
9399 }
9400
9401 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
9402
9403 fc_set_mid(ctx, fscp - 1);
9404
9405 return 0;
9406 }
9407
/* EMIT/CUT for geometry shaders.  The stream index is an immediate operand,
 * read back from the literal pool.  For EMIT_VERTEX the staged outputs are
 * flushed to the GS ring before the CF, and the ring offset is bumped
 * afterwards. */
static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
	int r;

	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);

	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	if (!r) {
		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
		if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
			return emit_inc_ring_offset(ctx, stream, TRUE);
	}
	return r;
}
9425
/* UMAD: dst = src0 * src1 + src2 (32-bit unsigned mul-add, low bits).
 * MULLO_UINT is a t-slot-only op, so on CAYMAN it is replicated across all
 * four vector slots with only slot i's result written (see CAYMAN notes at
 * the top of this file); src2 is then added with ADD_INT. */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP2_MULLO_UINT;
				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
				}
				alu.dst.chan = j;
				alu.dst.sel = ctx->temp_reg;
				/* keep only the replica landing on channel i */
				alu.dst.write = (j == i);
				if (j == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.dst.chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;

			alu.op = ALU_OP2_MULLO_UINT;
			for (j = 0; j < 2; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}

			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}


	/* dst = temp + src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9497
/* PK2H: pack src.xy (two f32) into one 32-bit word of two half floats.
 * temp.x = f16(src.x), temp.y = f16(src.y), then every written dst channel
 * gets temp.y * 0x10000 + temp.x.  MULADD_UINT24 is sufficient here because
 * both halves fit in 16 bits, well inside the 24-bit multiply. */
static int tgsi_pk2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.xy = f32_to_f16(src) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FLT32_TO_FLT16;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* alu is reused for the second conversion: only dst.chan, src and the
	 * group-terminating last flag change */
	alu.dst.chan = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.y * 0x10000 + temp.x */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD_UINT24;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 0x10000;
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = 0;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
9545
/* UP2H: unpack a 32-bit word of two half floats from src.x.
 * temp.x = low half, temp.y = high half; each written dst channel then gets
 * f16_to_f32 of the corresponding half (x/z from the low half, y/w from the
 * high half, via chan = i % 2). */
static int tgsi_up2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.x = src.x */
	/* note: no need to mask out the high bits */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = src.x >> 16 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.dst.chan = 1;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 16;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.wz = dst.xy = f16_to_f32(temp.xy) */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.op = ALU_OP1_FLT16_TO_FLT32;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i % 2;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
9596
9597 static int tgsi_bfe(struct r600_shader_ctx *ctx)
9598 {
9599 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9600 struct r600_bytecode_alu alu;
9601 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9602 int r, i;
9603
9604 r = tgsi_op3(ctx);
9605 if (r)
9606 return r;
9607
9608 for (i = 0; i < lasti + 1; i++) {
9609 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9610 alu.op = ALU_OP2_SETGE_INT;
9611 r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
9612 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
9613 alu.src[1].value = 32;
9614 alu.dst.sel = ctx->temp_reg;
9615 alu.dst.chan = i;
9616 alu.dst.write = 1;
9617 if (i == lasti)
9618 alu.last = 1;
9619 r = r600_bytecode_add_alu(ctx->bc, &alu);
9620 if (r)
9621 return r;
9622 }
9623
9624 for (i = 0; i < lasti + 1; i++) {
9625 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9626 alu.op = ALU_OP3_CNDE_INT;
9627 alu.is_op3 = 1;
9628 alu.src[0].sel = ctx->temp_reg;
9629 alu.src[1].chan = i;
9630
9631 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9632 alu.src[1].sel = alu.dst.sel;
9633 alu.src[1].chan = i;
9634 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
9635 alu.dst.write = 1;
9636 if (i == lasti)
9637 alu.last = 1;
9638 r = r600_bytecode_add_alu(ctx->bc, &alu);
9639 if (r)
9640 return r;
9641 }
9642
9643 return 0;
9644 }
9645
9646 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
9647 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_r600_arl},
9648 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
9649 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
9650
9651 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
9652
9653 [TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
9654 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
9655 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
9656 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
9657 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
9658 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
9659 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
9660 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
9661 /* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
9662 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
9663 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
9664 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
9665 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
9666 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
9667 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
9668 [TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported},
9669 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
9670 [21] = { ALU_OP0_NOP, tgsi_unsupported},
9671 [22] = { ALU_OP0_NOP, tgsi_unsupported},
9672 [23] = { ALU_OP0_NOP, tgsi_unsupported},
9673 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
9674 [25] = { ALU_OP0_NOP, tgsi_unsupported},
9675 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
9676 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
9677 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
9678 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
9679 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
9680 [31] = { ALU_OP0_NOP, tgsi_unsupported},
9681 [32] = { ALU_OP0_NOP, tgsi_unsupported},
9682 [33] = { ALU_OP0_NOP, tgsi_unsupported},
9683 [34] = { ALU_OP0_NOP, tgsi_unsupported},
9684 [35] = { ALU_OP0_NOP, tgsi_unsupported},
9685 [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
9686 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
9687 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
9688 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
9689 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported},
9690 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
9691 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
9692 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
9693 [44] = { ALU_OP0_NOP, tgsi_unsupported},
9694 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
9695 [46] = { ALU_OP0_NOP, tgsi_unsupported},
9696 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
9697 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
9698 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
9699 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
9700 [51] = { ALU_OP0_NOP, tgsi_unsupported},
9701 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
9702 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
9703 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
9704 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported},
9705 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
9706 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
9707 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
9708 [59] = { ALU_OP0_NOP, tgsi_unsupported},
9709 [60] = { ALU_OP0_NOP, tgsi_unsupported},
9710 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_r600_arl},
9711 [62] = { ALU_OP0_NOP, tgsi_unsupported},
9712 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
9713 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
9714 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
9715 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
9716 [67] = { ALU_OP0_NOP, tgsi_unsupported},
9717 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
9718 [69] = { ALU_OP0_NOP, tgsi_unsupported},
9719 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
9720 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
9721 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
9722 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
9723 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
9724 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
9725 [76] = { ALU_OP0_NOP, tgsi_unsupported},
9726 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
9727 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
9728 [TGSI_OPCODE_DDX_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
9729 [TGSI_OPCODE_DDY_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
9730 [81] = { ALU_OP0_NOP, tgsi_unsupported},
9731 [82] = { ALU_OP0_NOP, tgsi_unsupported},
9732 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
9733 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
9734 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
9735 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
9736 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2_trans},
9737 [88] = { ALU_OP0_NOP, tgsi_unsupported},
9738 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
9739 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
9740 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
9741 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
9742 [93] = { ALU_OP0_NOP, tgsi_unsupported},
9743 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
9744 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9745 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
9746 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
9747 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
9748 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
9749 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
9750 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
9751 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
9752 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9753 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
9754 [TGSI_OPCODE_RESQ] = { ALU_OP0_NOP, tgsi_unsupported},
9755 [106] = { ALU_OP0_NOP, tgsi_unsupported},
9756 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
9757 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
9758 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
9759 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
9760 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
9761 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_unsupported},
9762 [113] = { ALU_OP0_NOP, tgsi_unsupported},
9763 [114] = { ALU_OP0_NOP, tgsi_unsupported},
9764 [115] = { ALU_OP0_NOP, tgsi_unsupported},
9765 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
9766 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
9767 [TGSI_OPCODE_DFMA] = { ALU_OP0_NOP, tgsi_unsupported},
9768 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
9769 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
9770 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
9771 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
9772 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
9773 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
9774 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2_trans},
9775 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
9776 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
9777 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
9778 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
9779 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
9780 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
9781 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
9782 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
9783 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
9784 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
9785 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
9786 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
9787 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2_trans},
9788 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
9789 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2_swap},
9790 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
9791 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
9792 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
9793 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
9794 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
9795 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
9796 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
9797 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
9798 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
9799 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
9800 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
9801 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
9802 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
9803 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
9804 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
9805 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
9806 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_r600_arl},
9807 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
9808 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
9809 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
9810 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
9811 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
9812 [163] = { ALU_OP0_NOP, tgsi_unsupported},
9813 [164] = { ALU_OP0_NOP, tgsi_unsupported},
9814 [165] = { ALU_OP0_NOP, tgsi_unsupported},
9815 [TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported},
9816 [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
9817 [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
9818 [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
9819 [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
9820 [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
9821 [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
9822 [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
9823 [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
9824 [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
9825 [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
9826 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
9827 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
9828 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
9829 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
9830 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
9831 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_unsupported},
9832 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_unsupported},
9833 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_unsupported},
9834 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_unsupported},
9835 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_unsupported},
9836 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_unsupported},
9837 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_unsupported},
9838 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_unsupported},
9839 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_unsupported},
9840 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_unsupported},
9841 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_unsupported},
9842 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_unsupported},
9843 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_unsupported},
9844 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
9845 };
9846
/*
 * TGSI -> r600 bytecode dispatch table for Evergreen (eg) GPUs.
 * Same layout as r600_shader_tgsi_instruction above: hardware opcode
 * plus emit callback per TGSI opcode.  This generation implements more
 * of the opcode space than the base r600 table: FMA, half-float
 * pack/unpack (PK2H/UP2H), fine derivatives, memory LOAD/STORE, RAT
 * atomics (V_RAT_INST_*), barriers, and 64-bit float (double) ALU ops.
 * NOTE(review): some double-precision entries use cayman_* callbacks —
 * presumably those helpers also handle the Evergreen path; confirm in
 * their definitions elsewhere in this file.
 */
9847 static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
9848 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
9849 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
9850 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
9851 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
9852 [TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
9853 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
9854 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
9855 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
9856 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
9857 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
9858 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
9859 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
9860 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
9861 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
9862 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
9863 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
9864 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
9865 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
9866 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
9867 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
9868 [21] = { ALU_OP0_NOP, tgsi_unsupported},
9869 [22] = { ALU_OP0_NOP, tgsi_unsupported},
9870 [23] = { ALU_OP0_NOP, tgsi_unsupported},
9871 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
9872 [25] = { ALU_OP0_NOP, tgsi_unsupported},
9873 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
9874 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
9875 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
9876 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
9877 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
9878 [31] = { ALU_OP0_NOP, tgsi_unsupported},
9879 [32] = { ALU_OP0_NOP, tgsi_unsupported},
9880 [33] = { ALU_OP0_NOP, tgsi_unsupported},
9881 [34] = { ALU_OP0_NOP, tgsi_unsupported},
9882 [35] = { ALU_OP0_NOP, tgsi_unsupported},
9883 [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
9884 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
9885 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
9886 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
9887 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
9888 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
9889 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
9890 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
9891 [44] = { ALU_OP0_NOP, tgsi_unsupported},
9892 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
9893 [46] = { ALU_OP0_NOP, tgsi_unsupported},
9894 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
9895 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
9896 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
9897 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
9898 [51] = { ALU_OP0_NOP, tgsi_unsupported},
9899 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
9900 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
9901 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
9902 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
9903 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
9904 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
9905 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
9906 [59] = { ALU_OP0_NOP, tgsi_unsupported},
9907 [60] = { ALU_OP0_NOP, tgsi_unsupported},
9908 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
9909 [62] = { ALU_OP0_NOP, tgsi_unsupported},
9910 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
9911 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
9912 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
9913 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
9914 [67] = { ALU_OP0_NOP, tgsi_unsupported},
9915 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
9916 [69] = { ALU_OP0_NOP, tgsi_unsupported},
9917 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
9918 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
9919 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
9920 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
9921 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
9922 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
9923 [76] = { ALU_OP0_NOP, tgsi_unsupported},
9924 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
9925 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
9926 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
9927 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
9928 [82] = { ALU_OP0_NOP, tgsi_unsupported},
9929 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
9930 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
9931 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
9932 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
9933 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
9934 [88] = { ALU_OP0_NOP, tgsi_unsupported},
9935 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
9936 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
9937 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
9938 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
9939 [93] = { ALU_OP0_NOP, tgsi_unsupported},
9940 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
9941 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9942 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
9943 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
9944 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
9945 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
9946 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
9947 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
9948 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
9949 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
9950 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
9951 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
9952 [106] = { ALU_OP0_NOP, tgsi_unsupported},
9953 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
9954 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
9955 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
9956 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
9957 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
9958 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
9959 [113] = { ALU_OP0_NOP, tgsi_unsupported},
9960 [114] = { ALU_OP0_NOP, tgsi_unsupported},
9961 [115] = { ALU_OP0_NOP, tgsi_unsupported},
9962 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
9963 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
9964 /* Refer below for TGSI_OPCODE_DFMA */
9965 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_f2i},
9966 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
9967 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
9968 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
9969 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
9970 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
9971 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
9972 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
9973 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
9974 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
9975 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
9976 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
9977 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
9978 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
9979 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
9980 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
9981 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
9982 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
9983 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
9984 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
9985 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
9986 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
9987 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
9988 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
9989 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
9990 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
9991 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
9992 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
9993 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
9994 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
9995 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
9996 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
9997 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
9998 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
9999 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
10000 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
10001 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
10002 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
10003 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
10004 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
10005 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
10006 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
10007 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
10008 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
10009 [163] = { ALU_OP0_NOP, tgsi_unsupported},
10010 [164] = { ALU_OP0_NOP, tgsi_unsupported},
10011 [165] = { ALU_OP0_NOP, tgsi_unsupported},
10012 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
10013 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
10014 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
10015 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
10016 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
10017 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
10018 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
10019 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
10020 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
10021 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
10022 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
10023 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
10024 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
10025 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
10026 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
10027 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
10028 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
10029 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
10030 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
10031 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
10032 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
10033 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
10034 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
10035 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
10036 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
10037 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
10038 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
10039 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
10040 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
10041 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
10042 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
10043 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
10044 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
10045 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
10046 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
10047 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
10048 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
10049 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
10050 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
10051 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
10052 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
10053 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
10054 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
10055 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
10056 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
10057 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
10058 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
10059 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
10060 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
10061 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
10062 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
10063 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
10064 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
10065 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
10066 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
10067 };
10068
10069 static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
10070 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
10071 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
10072 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
10073 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
10074 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
10075 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
10076 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
10077 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
10078 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
10079 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10080 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10081 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
10082 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
10083 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
10084 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
10085 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
10086 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
10087 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
10088 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
10089 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
10090 [21] = { ALU_OP0_NOP, tgsi_unsupported},
10091 [22] = { ALU_OP0_NOP, tgsi_unsupported},
10092 [23] = { ALU_OP0_NOP, tgsi_unsupported},
10093 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
10094 [25] = { ALU_OP0_NOP, tgsi_unsupported},
10095 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
10096 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
10097 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
10098 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
10099 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow},
10100 [31] = { ALU_OP0_NOP, tgsi_unsupported},
10101 [32] = { ALU_OP0_NOP, tgsi_unsupported},
10102 [33] = { ALU_OP0_NOP, tgsi_unsupported},
10103 [34] = { ALU_OP0_NOP, tgsi_unsupported},
10104 [35] = { ALU_OP0_NOP, tgsi_unsupported},
10105 [TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},
10106 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
10107 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
10108 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
10109 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
10110 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
10111 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
10112 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
10113 [44] = { ALU_OP0_NOP, tgsi_unsupported},
10114 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
10115 [46] = { ALU_OP0_NOP, tgsi_unsupported},
10116 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
10117 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig},
10118 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
10119 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
10120 [51] = { ALU_OP0_NOP, tgsi_unsupported},
10121 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
10122 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
10123 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
10124 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
10125 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
10126 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
10127 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
10128 [59] = { ALU_OP0_NOP, tgsi_unsupported},
10129 [60] = { ALU_OP0_NOP, tgsi_unsupported},
10130 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
10131 [62] = { ALU_OP0_NOP, tgsi_unsupported},
10132 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
10133 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
10134 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
10135 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
10136 [67] = { ALU_OP0_NOP, tgsi_unsupported},
10137 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
10138 [69] = { ALU_OP0_NOP, tgsi_unsupported},
10139 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
10140 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10141 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
10142 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
10143 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
10144 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
10145 [76] = { ALU_OP0_NOP, tgsi_unsupported},
10146 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
10147 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
10148 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
10149 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
10150 [82] = { ALU_OP0_NOP, tgsi_unsupported},
10151 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
10152 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2},
10153 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
10154 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
10155 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
10156 [88] = { ALU_OP0_NOP, tgsi_unsupported},
10157 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
10158 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
10159 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
10160 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
10161 [93] = { ALU_OP0_NOP, tgsi_unsupported},
10162 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
10163 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
10164 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
10165 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
10166 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
10167 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
10168 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
10169 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
10170 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
10171 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
10172 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
10173 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
10174 [106] = { ALU_OP0_NOP, tgsi_unsupported},
10175 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
10176 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
10177 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
10178 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
10179 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
10180 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
10181 [113] = { ALU_OP0_NOP, tgsi_unsupported},
10182 [114] = { ALU_OP0_NOP, tgsi_unsupported},
10183 [115] = { ALU_OP0_NOP, tgsi_unsupported},
10184 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
10185 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
10186 /* Refer below for TGSI_OPCODE_DFMA */
10187 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2},
10188 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
10189 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
10190 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
10191 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
10192 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
10193 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
10194 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
10195 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2},
10196 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2},
10197 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
10198 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
10199 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
10200 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
10201 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
10202 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
10203 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
10204 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
10205 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
10206 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
10207 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
10208 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
10209 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
10210 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
10211 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
10212 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
10213 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
10214 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
10215 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
10216 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
10217 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
10218 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
10219 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
10220 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
10221 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
10222 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
10223 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
10224 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
10225 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
10226 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
10227 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
10228 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
10229 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
10230 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
10231 [163] = { ALU_OP0_NOP, tgsi_unsupported},
10232 [164] = { ALU_OP0_NOP, tgsi_unsupported},
10233 [165] = { ALU_OP0_NOP, tgsi_unsupported},
10234 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
10235 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
10236 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
10237 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
10238 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
10239 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
10240 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
10241 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
10242 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
10243 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
10244 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
10245 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
10246 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
10247 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
10248 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
10249 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
10250 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
10251 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
10252 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
10253 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
10254 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
10255 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
10256 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
10257 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
10258 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
10259 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
10260 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
10261 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
10262 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
10263 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
10264 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
10265 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
10266 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
10267 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
10268 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
10269 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
10270 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
10271 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
10272 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
10273 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
10274 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
10275 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
10276 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
10277 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
10278 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
10279 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
10280 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
10281 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
10282 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
10283 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
10284 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
10285 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
10286 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
10287 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
10288 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
10289 };