r600: fix bfe where src/dst are same.
[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include "r600_sq.h"
24 #include "r600_formats.h"
25 #include "r600_opcodes.h"
26 #include "r600_shader.h"
27 #include "r600d.h"
28
29 #include "sb/sb_public.h"
30
31 #include "pipe/p_shader_tokens.h"
32 #include "tgsi/tgsi_info.h"
33 #include "tgsi/tgsi_parse.h"
34 #include "tgsi/tgsi_scan.h"
35 #include "tgsi/tgsi_dump.h"
36 #include "util/u_bitcast.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include <stdio.h>
40 #include <errno.h>
41
42 /* CAYMAN notes
43 Why CAYMAN got loops for lots of instructions is explained here.
44
45 -These 8xx t-slot only ops are implemented in all vector slots.
46 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47 These 8xx t-slot only opcodes become vector ops, with all four
48 slots expecting the arguments on sources a and b. Result is
49 broadcast to all channels.
50 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51 These 8xx t-slot only opcodes become vector ops in the z, y, and
52 x slots.
53 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55 SQRT_IEEE/_64
56 SIN/COS
57 The w slot may have an independent co-issued operation, or if the
58 result is required to be in the w slot, the opcode above may be
59 issued in the w slot as well.
60 The compiler must issue the source argument to slots z, y, and x
61 */
62
63 /* Contents of r0 on entry to various shaders
64
65 VS - .x = VertexID
66 .y = RelVertexID (??)
67 .w = InstanceID
68
69 GS - r0.xyw, r1.xyz = per-vertex offsets
70 r0.z = PrimitiveID
71
72 TCS - .x = PatchID
73 .y = RelPatchID (??)
74 .z = InvocationID
75 .w = tess factor base.
76
77 TES - .x = TessCoord.x
78 - .y = TessCoord.y
79 - .z = RelPatchID (??)
80 - .w = PrimitiveID
81
82 PS - face_gpr.z = SampleMask
83 face_gpr.w = SampleID
84 */
85 #define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
86 static int r600_shader_from_tgsi(struct r600_context *rctx,
87 struct r600_pipe_shader *pipeshader,
88 union r600_shader_key key);
89
90 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
91 int size, unsigned comp_mask) {
92
93 if (!size)
94 return;
95
96 if (ps->num_arrays == ps->max_arrays) {
97 ps->max_arrays += 64;
98 ps->arrays = realloc(ps->arrays, ps->max_arrays *
99 sizeof(struct r600_shader_array));
100 }
101
102 int n = ps->num_arrays;
103 ++ps->num_arrays;
104
105 ps->arrays[n].comp_mask = comp_mask;
106 ps->arrays[n].gpr_start = start_gpr;
107 ps->arrays[n].gpr_count = size;
108 }
109
110 static void r600_dump_streamout(struct pipe_stream_output_info *so)
111 {
112 unsigned i;
113
114 fprintf(stderr, "STREAMOUT\n");
115 for (i = 0; i < so->num_outputs; i++) {
116 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
117 so->output[i].start_component;
118 fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
119 i,
120 so->output[i].stream,
121 so->output[i].output_buffer,
122 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
123 so->output[i].register_index,
124 mask & 1 ? "x" : "",
125 mask & 2 ? "y" : "",
126 mask & 4 ? "z" : "",
127 mask & 8 ? "w" : "",
128 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
129 }
130 }
131
132 static int store_shader(struct pipe_context *ctx,
133 struct r600_pipe_shader *shader)
134 {
135 struct r600_context *rctx = (struct r600_context *)ctx;
136 uint32_t *ptr, i;
137
138 if (shader->bo == NULL) {
139 shader->bo = (struct r600_resource*)
140 pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
141 if (shader->bo == NULL) {
142 return -ENOMEM;
143 }
144 ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
145 if (R600_BIG_ENDIAN) {
146 for (i = 0; i < shader->shader.bc.ndw; ++i) {
147 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
148 }
149 } else {
150 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
151 }
152 rctx->b.ws->buffer_unmap(shader->bo->buf);
153 }
154
155 return 0;
156 }
157
/* Translate a TGSI shader into r600 bytecode, optionally run the SB
 * optimizing backend, upload the result to a GPU buffer, and build the
 * chip-specific state for the shader's stage.
 *
 * Returns 0 on success or a negative errno; on failure the partially
 * built shader is torn down via r600_pipe_shader_destroy.
 */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b,
					 tgsi_get_processor_type(sel->tokens));
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}

	/* The SB backend is disabled for several shader variants it cannot
	 * handle correctly. */
	if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	use_sb &= !shader->shader.uses_atomics;
	use_sb &= !shader->shader.uses_images;

	/* Check if the bytecode has already been built. */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	/* SB doubles as a disassembler, so it runs either to optimize
	 * (use_sb) or only to dump (sb_disasm); otherwise use the plain
	 * disassembler for dumps. */
	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
					     dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	/* A geometry shader carries a VS copy shader that must be uploaded
	 * (and optionally dumped) as well. */
	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case PIPE_SHADER_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case PIPE_SHADER_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case PIPE_SHADER_GEOMETRY:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case PIPE_SHADER_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case PIPE_SHADER_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
292
/* Release everything a pipe shader owns: its GPU buffer, its bytecode,
 * and its command buffer.  Also used as the error-path cleanup in
 * r600_pipe_shader_create. */
void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
{
	r600_resource_reference(&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
299
300 /*
301 * tgsi -> r600 shader
302 */
303 struct r600_shader_tgsi_instruction;
304
/* A decoded TGSI source operand: the register selector plus the
 * per-channel swizzle and modifiers applied when it is emitted. */
struct r600_shader_src {
	unsigned				sel;		/* register / special-value selector */
	unsigned				swizzle[4];	/* channel swizzle, one entry per dst chan */
	unsigned				neg;		/* negate modifier */
	unsigned				abs;		/* absolute-value modifier */
	unsigned				rel;		/* relative (indirect) addressing */
	unsigned				kc_bank;	/* constant-cache bank for constant srcs */
	boolean					kc_rel;		/* true if cache bank is indexed */
	uint32_t				value[4];	/* literal payload when sel is a literal */
};
315
/* Per interpolation-mode state for evergreen: whether the mode is used
 * by any input, and which i/j coefficient slot it was assigned. */
struct eg_interp {
	boolean					enabled;
	unsigned				ij_index;
};
320
/* All state carried across the TGSI -> r600 bytecode translation of a
 * single shader. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;		/* PIPE_SHADER_* stage being compiled */
	unsigned				file_offset[TGSI_FILE_COUNT];	/* first GPR of each TGSI register file */
	unsigned				temp_reg;	/* base GPR for driver temporaries */
	const struct r600_shader_tgsi_instruction	*inst_info;	/* handler for the current opcode */
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];		/* decoded sources of the current instruction */
	uint32_t				*literals;
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used;
	/* needed for evergreen interpolation */
	struct eg_interp			eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean					clip_vertex_write;
	unsigned				cv_output;	/* output index of TGSI_SEMANTIC_CLIPVERTEX */
	unsigned				edgeflag_output;	/* output index of TGSI_SEMANTIC_EDGEFLAG */
	int					fragcoord_input;	/* input index of TGSI_SEMANTIC_POSITION (FS) */
	int					native_integers;
	int					next_ring_offset;	/* GS input ring offset accumulator */
	int					gs_out_ring_offset;
	int					gs_next_vertex;
	struct r600_shader			*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	int					gs_rotated_input[2];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
	unsigned				tess_input_info; /* temp with tess input offsets */
	unsigned				tess_output_info; /* temp with tess output offsets */
	unsigned				thread_id_gpr; /* temp with thread id calculated for images */
};
359
/* One entry of the per-chip TGSI opcode tables: the hardware opcode to
 * use plus the handler that emits bytecode for it. */
struct r600_shader_tgsi_instruction {
	unsigned	op;
	int (*process)(struct r600_shader_ctx *ctx);
};
364
365 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
366 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
367 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
368 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
369 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
370 static int tgsi_else(struct r600_shader_ctx *ctx);
371 static int tgsi_endif(struct r600_shader_ctx *ctx);
372 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
373 static int tgsi_endloop(struct r600_shader_ctx *ctx);
374 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
375 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
376 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
377 unsigned int dst_reg);
378 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
379 const struct r600_shader_src *shader_src,
380 unsigned chan);
381 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
382 unsigned dst_reg, unsigned mask);
383
/* Return the highest component index (0-3) set in writemask.
 * An empty mask yields 0, same as a mask with only bit 0 set. */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	for (chan = 3; chan > 0; chan--) {
		if (writemask & (1 << chan))
			return chan;
	}
	return 0;
}
395
/* Reject TGSI constructs this backend cannot translate.
 * Returns 0 if the current instruction is supported, -EINVAL otherwise. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	unsigned j;

	/* DFRACEXP is the only opcode allowed two destinations. */
	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	/* 2D (dimensioned) source registers are only supported for a few
	 * file/stage combinations. */
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
			switch (i->Src[j].Register.File) {
			case TGSI_FILE_CONSTANT:
			case TGSI_FILE_HW_ATOMIC:
				break;
			case TGSI_FILE_INPUT:
				if (ctx->type == PIPE_SHADER_GEOMETRY ||
				    ctx->type == PIPE_SHADER_TESS_CTRL ||
				    ctx->type == PIPE_SHADER_TESS_EVAL)
					break;
				/* fall through - 2D inputs unsupported for other stages */
			case TGSI_FILE_OUTPUT:
				if (ctx->type == PIPE_SHADER_TESS_CTRL)
					break;
				/* fall through */
			default:
				R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
					 i->Src[j].Register.File,
					 i->Src[j].Register.Dimension);
				return -EINVAL;
			}
		}
	}
	/* Dimensioned destinations only exist for TCS per-patch outputs. */
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			if (ctx->type == PIPE_SHADER_TESS_CTRL)
				continue;
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}
443
444 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
445 {
446 if (interpolate == TGSI_INTERPOLATE_COLOR ||
447 interpolate == TGSI_INTERPOLATE_LINEAR ||
448 interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
449 {
450 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
451 int loc;
452
453 switch(location) {
454 case TGSI_INTERPOLATE_LOC_CENTER:
455 loc = 1;
456 break;
457 case TGSI_INTERPOLATE_LOC_CENTROID:
458 loc = 2;
459 break;
460 case TGSI_INTERPOLATE_LOC_SAMPLE:
461 default:
462 loc = 0; break;
463 }
464
465 return is_linear * 3 + loc;
466 }
467
468 return -1;
469 }
470
/* Copy the i/j coefficient slot reserved for this input's interpolation
 * mode/location (see eg_interpolators) into the input itself. */
static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
					     int input)
{
	int i = eg_get_interpolator_index(
		ctx->shader->input[input].interpolate,
		ctx->shader->input[input].interpolate_location);
	assert(i >= 0);
	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
}
480
/* Emit the barycentric interpolation ALU ops for one fragment input on
 * evergreen.  Eight slots are emitted: the first four are INTERP_ZW,
 * the last four INTERP_XY, and only the slots whose destination channel
 * matches what each op produces (chans 2/3 for ZW, chans 0/1 for XY)
 * actually write the input's GPR. */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index: i/j coefficient pairs are
	 * packed two per GPR */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		/* only slots 2..5 write: chans 2/3 from ZW, 0/1 from XY */
		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* src0 alternates between the two coefficient channels of
		 * the pair selected by ij_index */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
521
522 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
523 {
524 int i, r;
525 struct r600_bytecode_alu alu;
526
527 for (i = 0; i < 4; i++) {
528 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
529
530 alu.op = ALU_OP1_INTERP_LOAD_P0;
531
532 alu.dst.sel = ctx->shader->input[input].gpr;
533 alu.dst.write = 1;
534
535 alu.dst.chan = i;
536
537 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
538 alu.src[0].chan = i;
539
540 if (i == 3)
541 alu.last = 1;
542 r = r600_bytecode_add_alu(ctx->bc, &alu);
543 if (r)
544 return r;
545 }
546 return 0;
547 }
548
549 /*
550 * Special export handling in shaders
551 *
552 * shader export ARRAY_BASE for EXPORT_POS:
553 * 60 is position
554 * 61 is misc vector
555 * 62, 63 are clip distance vectors
556 *
557 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
558 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
559 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
560 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
561 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
562 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
563 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
564 * exclusive from render target index)
565 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
566 *
567 *
568 * shader export ARRAY_BASE for EXPORT_PIXEL:
569 * 0-7 CB targets
570 * 61 computed Z vector
571 *
572 * The use of the values exported in the computed Z vector are controlled
573 * by DB_SHADER_CONTROL:
574 * Z_EXPORT_ENABLE - Z as a float in RED
575 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
576 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
577 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
578 * DB_SOURCE_FORMAT - export control restrictions
579 *
580 */
581
582
583 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
584 static int r600_spi_sid(struct r600_shader_io * io)
585 {
586 int index, name = io->name;
587
588 /* These params are handled differently, they don't need
589 * semantic indices, so we'll use 0 for them.
590 */
591 if (name == TGSI_SEMANTIC_POSITION ||
592 name == TGSI_SEMANTIC_PSIZE ||
593 name == TGSI_SEMANTIC_EDGEFLAG ||
594 name == TGSI_SEMANTIC_FACE ||
595 name == TGSI_SEMANTIC_SAMPLEMASK)
596 index = 0;
597 else {
598 if (name == TGSI_SEMANTIC_GENERIC) {
599 /* For generic params simply use sid from tgsi */
600 index = io->sid;
601 } else {
602 /* For non-generic params - pack name and sid into 8 bits */
603 index = 0x80 | (name<<3) | (io->sid);
604 }
605
606 /* Make sure that all really used indices have nonzero value, so
607 * we can just compare it to 0 later instead of comparing the name
608 * with different values to detect special cases. */
609 index++;
610 }
611
612 return index;
613 };
614
/* we need this to get a common lds index for vs/tcs/tes input/outputs */
int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
{
	switch (semantic_name) {
	case TGSI_SEMANTIC_POSITION:
		return 0;
	case TGSI_SEMANTIC_PSIZE:
		return 1;
	case TGSI_SEMANTIC_CLIPDIST:
		assert(index <= 1);
		return 2 + index;
	case TGSI_SEMANTIC_GENERIC:
		if (index <= 63-4)
			/* NOTE(review): generic sids are assumed to start at 9
			 * here, mapping them onto slots 4 and up; a sid below 9
			 * would yield a negative slot - confirm against the
			 * state tracker's sid assignment. */
			return 4 + index - 9;
		else
			/* same explanation as in the default statement,
			 * the only user hitting this is st/nine.
			 */
			return 0;

	/* patch indices are completely separate and thus start from 0 */
	case TGSI_SEMANTIC_TESSOUTER:
		return 0;
	case TGSI_SEMANTIC_TESSINNER:
		return 1;
	case TGSI_SEMANTIC_PATCH:
		return 2 + index;

	default:
		/* Don't fail here. The result of this function is only used
		 * for LS, TCS, TES, and GS, where legacy GL semantics can't
		 * occur, but this function is called for all vertex shaders
		 * before it's known whether LS will be compiled or not.
		 */
		return 0;
	}
}
652
653 /* turn input into interpolate on EG */
654 static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
655 {
656 int r = 0;
657
658 if (ctx->shader->input[index].spi_sid) {
659 ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
660 if (ctx->shader->input[index].interpolate > 0) {
661 evergreen_interp_assign_ij_index(ctx, index);
662 r = evergreen_interp_alu(ctx, index);
663 } else {
664 r = evergreen_interp_flat(ctx, index);
665 }
666 }
667 return r;
668 }
669
670 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
671 {
672 struct r600_bytecode_alu alu;
673 int i, r;
674 int gpr_front = ctx->shader->input[front].gpr;
675 int gpr_back = ctx->shader->input[back].gpr;
676
677 for (i = 0; i < 4; i++) {
678 memset(&alu, 0, sizeof(alu));
679 alu.op = ALU_OP3_CNDGT;
680 alu.is_op3 = 1;
681 alu.dst.write = 1;
682 alu.dst.sel = gpr_front;
683 alu.src[0].sel = ctx->face_gpr;
684 alu.src[1].sel = gpr_front;
685 alu.src[2].sel = gpr_back;
686
687 alu.dst.chan = i;
688 alu.src[1].chan = i;
689 alu.src[2].chan = i;
690 alu.last = (i==3);
691
692 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
693 return r;
694 }
695
696 return 0;
697 }
698
699 /* execute a single slot ALU calculation */
700 static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
701 int dst_sel, int dst_chan,
702 int src0_sel, unsigned src0_chan_val,
703 int src1_sel, unsigned src1_chan_val)
704 {
705 struct r600_bytecode_alu alu;
706 int r, i;
707
708 if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
709 for (i = 0; i < 4; i++) {
710 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
711 alu.op = op;
712 alu.src[0].sel = src0_sel;
713 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
714 alu.src[0].value = src0_chan_val;
715 else
716 alu.src[0].chan = src0_chan_val;
717 alu.src[1].sel = src1_sel;
718 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
719 alu.src[1].value = src1_chan_val;
720 else
721 alu.src[1].chan = src1_chan_val;
722 alu.dst.sel = dst_sel;
723 alu.dst.chan = i;
724 alu.dst.write = i == dst_chan;
725 alu.last = (i == 3);
726 r = r600_bytecode_add_alu(ctx->bc, &alu);
727 if (r)
728 return r;
729 }
730 return 0;
731 }
732
733 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
734 alu.op = op;
735 alu.src[0].sel = src0_sel;
736 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
737 alu.src[0].value = src0_chan_val;
738 else
739 alu.src[0].chan = src0_chan_val;
740 alu.src[1].sel = src1_sel;
741 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
742 alu.src[1].value = src1_chan_val;
743 else
744 alu.src[1].chan = src1_chan_val;
745 alu.dst.sel = dst_sel;
746 alu.dst.chan = dst_chan;
747 alu.dst.write = 1;
748 alu.last = 1;
749 r = r600_bytecode_add_alu(ctx->bc, &alu);
750 if (r)
751 return r;
752 return 0;
753 }
754
/* execute a single slot ALU calculation (3-source form).
 * Sources given as V_SQ_ALU_SRC_LITERAL take their chan_val argument as
 * the literal value; otherwise it is the register channel.
 * NOTE(review): unlike single_alu_op2, alu.dst.write is not set here -
 * presumably op3 encodings always write their destination; confirm
 * against r600_bytecode_add_alu. */
static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
			  int dst_sel, int dst_chan,
			  int src0_sel, unsigned src0_chan_val,
			  int src1_sel, unsigned src1_chan_val,
			  int src2_sel, unsigned src2_chan_val)
{
	struct r600_bytecode_alu alu;
	int r;

	/* validate this for other ops */
	assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = op;
	alu.src[0].sel = src0_sel;
	if (src0_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[0].value = src0_chan_val;
	else
		alu.src[0].chan = src0_chan_val;
	alu.src[1].sel = src1_sel;
	if (src1_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[1].value = src1_chan_val;
	else
		alu.src[1].chan = src1_chan_val;
	alu.src[2].sel = src2_sel;
	if (src2_sel == V_SQ_ALU_SRC_LITERAL)
		alu.src[2].value = src2_chan_val;
	else
		alu.src[2].chan = src2_chan_val;
	alu.dst.sel = dst_sel;
	alu.dst.chan = dst_chan;
	alu.is_op3 = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
793
794 /* put it in temp_reg.x */
795 static int get_lds_offset0(struct r600_shader_ctx *ctx,
796 int rel_patch_chan,
797 int temp_reg, bool is_patch_var)
798 {
799 int r;
800
801 /* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
802 /* ADD
803 Dimension - patch0_offset (input_vals.z),
804 Non-dim - patch0_data_offset (input_vals.w)
805 */
806 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
807 temp_reg, 0,
808 ctx->tess_output_info, 0,
809 0, rel_patch_chan,
810 ctx->tess_output_info, is_patch_var ? 3 : 2);
811 if (r)
812 return r;
813 return 0;
814 }
815
816 static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
817 {
818 return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
819 }
820
/* Hand out the next driver-internal temporary GPR, counting upward from
 * temp_reg; max_driver_temp_used tracks how many have been taken. */
static int r600_get_temp(struct r600_shader_ctx *ctx)
{
	return ctx->temp_reg + ctx->max_driver_temp_used++;
}
825
826 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
827 {
828 int i;
829 i = ctx->shader->noutput++;
830 ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
831 ctx->shader->output[i].sid = 0;
832 ctx->shader->output[i].gpr = 0;
833 ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
834 ctx->shader->output[i].write_mask = 0x4;
835 ctx->shader->output[i].spi_sid = prim_id_sid;
836
837 return 0;
838 }
839
/* Emit the barrier op selected by the opcode table (inst_info->op) as a
 * stand-alone ALU instruction; no sources or destination are needed. */
static int tgsi_barrier(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.last = 1;

	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
854
/* Process one TGSI declaration token: record inputs/outputs and their
 * SPI semantic ids, register GPR arrays for indirectly addressed
 * temporaries, track hardware atomic ranges, and emit any setup code
 * some system values require. */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < ARRAY_SIZE(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == PIPE_SHADER_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				/* Remember special FS inputs for later passes. */
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				if (ctx->bc->chip_class >= EVERGREEN) {
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == PIPE_SHADER_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < ARRAY_SIZE(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == PIPE_SHADER_VERTEX ||
			    ctx->type == PIPE_SHADER_GEOMETRY ||
			    ctx->type == PIPE_SHADER_TESS_EVAL) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				/* Track which misc-vector fields this shader
				 * writes (see the export notes above). */
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == PIPE_SHADER_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == PIPE_SHADER_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		/* Only indirectly addressed temp ranges need to be recorded
		 * as GPR arrays. */
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				r600_add_gpr_array(ctx->shader,
						   ctx->file_offset[TGSI_FILE_TEMPORARY] +
						   d->Range.First,
						   d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
	case TGSI_FILE_IMAGE:
		break;

	case TGSI_FILE_HW_ATOMIC:
		/* Record the atomic counter range; hw_idx is the flattened
		 * hardware counter slot. */
		i = ctx->shader->nhwatomic_ranges;
		ctx->shader->atomics[i].start = d->Range.First;
		ctx->shader->atomics[i].end = d->Range.Last;
		ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
		ctx->shader->atomics[i].array_id = d->Array.ArrayID;
		ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
		ctx->shader->nhwatomic_ranges++;
		ctx->shader->nhwatomic += count;
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			/* InstanceID arrives as an int in r0.w; shaders without
			 * native integers expect it as a float. */
			if (!ctx->native_integers) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_INT_TO_FLT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
			 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
			/* Fetch the tess factors from LDS into a fixed GPR
			 * (3 for inner, 2 for outer). */
			int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
			int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
			unsigned temp_reg = r600_get_temp(ctx);

			r = get_lds_offset0(ctx, 2, temp_reg, true);
			if (r)
				return r;

			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 0,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;

			do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
		}
		else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
			/* Build r1 = (u, v, 1 - u - v) from r0.xy. */
			/* MOV r1.x, r0.x;
			   MOV r1.y, r0.y;
			*/
			for (i = 0; i < 2; i++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = 0;
				alu.src[0].chan = 0 + i;
				alu.dst.sel = 1;
				alu.dst.chan = 0 + i;
				alu.dst.write = 1;
				alu.last = (i == 1) ? 1 : 0;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			/* ADD r1.z, 1.0f, -r0.x */
			struct r600_bytecode_alu alu;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = V_SQ_ALU_SRC_1;
			alu.src[1].sel = 1;
			alu.src[1].chan = 0;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* ADD r1.z, r1.z, -r1.y
			 * (alu is deliberately reused without memset; only the
			 * fields below change from the previous op) */
			alu.op = ALU_OP2_ADD;
			alu.src[0].sel = 1;
			alu.src[0].chan = 2;
			alu.src[1].sel = 1;
			alu.src[1].chan = 1;
			alu.src[1].neg = 1;
			alu.dst.sel = 1;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
			break;
		}
		break;
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
1085
/* Scan the shader for system-value inputs (sample mask, sample id/pos) and
 * for interpolateAtSample (which implicitly needs SAMPLEID), then allocate
 * one GPR per enabled input, starting at gpr_offset, recording the GPR in
 * the corresponding ctx field (face_gpr / fixed_pt_position_gpr).
 *
 * Returns the first free GPR after the allocated inputs (gpr_offset itself
 * if none were needed or if the token stream could not be parsed).
 */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;                /* set while scanning the tokens */
		int *reg;                       /* ctx field that receives the GPR */
		unsigned name, alternate_name;  /* TGSI semantics served by this GPR */
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int num_regs = 0;
	unsigned k, i;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					inputs[1].enabled = true; /* needs SAMPLEID */
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				/* also enable the interpolator this instruction needs */
				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				/* either the primary or the alternate semantic enables
				 * the shared GPR */
				for (k = 0; k < ARRAY_SIZE(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* allocate a GPR and a shader input slot for each enabled entry */
	for (i = 0; i < ARRAY_SIZE(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;
			ctx->shader->nsys_inputs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
1168
/*
 * for evergreen we need to scan the shader to find the number of GPRs we need to
 * reserve for interpolation and system values
 *
 * we need to know if we are going to emit
 * any sample or centroid inputs
 * if perspective and linear are required
 */
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	unsigned i;
	int num_baryc;
	struct tgsi_parse_context parse;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	/* first pass: enable interpolators for the declared inputs */
	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				if (k >= 0)
					ctx->eg_interpolators[k].enabled = true;
			}
		}
	}

	tgsi_parse_free(&parse);

	/* assign gpr to each interpolator according to priority */
	num_baryc = 0;
	for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
		if (ctx->eg_interpolators[i].enabled) {
			ctx->eg_interpolators[i].ij_index = num_baryc;
			num_baryc ++;
		}
	}

	/* XXX PULL MODEL and LINE STIPPLE */

	/* two i/j barycentric pairs fit in one GPR, so round up to pairs */
	num_baryc = (num_baryc + 1) >> 1;
	return allocate_system_value_inputs(ctx, num_baryc);
}
1249
1250 /* sample_id_sel == NULL means fetch for current sample */
1251 static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
1252 {
1253 struct r600_bytecode_vtx vtx;
1254 int r, t1;
1255
1256 assert(ctx->fixed_pt_position_gpr != -1);
1257
1258 t1 = r600_get_temp(ctx);
1259
1260 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1261 vtx.op = FETCH_OP_VFETCH;
1262 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1263 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1264 if (sample_id == NULL) {
1265 vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
1266 vtx.src_sel_x = 3;
1267 }
1268 else {
1269 struct r600_bytecode_alu alu;
1270
1271 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1272 alu.op = ALU_OP1_MOV;
1273 r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
1274 alu.dst.sel = t1;
1275 alu.dst.write = 1;
1276 alu.last = 1;
1277 r = r600_bytecode_add_alu(ctx->bc, &alu);
1278 if (r)
1279 return r;
1280
1281 vtx.src_gpr = t1;
1282 vtx.src_sel_x = 0;
1283 }
1284 vtx.mega_fetch_count = 16;
1285 vtx.dst_gpr = t1;
1286 vtx.dst_sel_x = 0;
1287 vtx.dst_sel_y = 1;
1288 vtx.dst_sel_z = 2;
1289 vtx.dst_sel_w = 3;
1290 vtx.data_format = FMT_32_32_32_32_FLOAT;
1291 vtx.num_format_all = 2;
1292 vtx.format_comp_all = 1;
1293 vtx.use_const_fields = 0;
1294 vtx.offset = 1; // first element is size of buffer
1295 vtx.endian = r600_endian_swap(32);
1296 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1297
1298 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1299 if (r)
1300 return r;
1301
1302 return t1;
1303 }
1304
/* Translate a TGSI full source register into the driver's r600_shader_src
 * representation: swizzles, negate/absolute modifiers, the ALU source
 * selector, and special handling for immediates and system values.
 * No bytecode is emitted except via load_sample_position for SAMPLEPOS. */
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		/* If all four swizzles select the same channel, the immediate
		 * may map to a hardware inline constant (0, 1, 0.5, ...). */
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		/* otherwise keep it as a 4-dword literal */
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		/* System values live in fixed GPRs/channels set up by the
		 * hardware or by the prolog; force the swizzles accordingly. */
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			/* fetched from the buffer-info constant buffer; z/w are
			 * forced to 0/1 via swizzle 4 */
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* non-TCS invocation id (GS): R1.w */
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* TCS invocation id: R0.z */
			r600_src->swizzle[0] = 2;
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
			r600_src->sel = 1;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
			r600_src->sel = 3;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
			r600_src->sel = 2;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
			/* vertex count lives in a different channel of the
			 * tess input info register for TCS vs TES */
			if (ctx->type == PIPE_SHADER_TESS_CTRL) {
				r600_src->sel = ctx->tess_input_info;
				r600_src->swizzle[0] = 2;
				r600_src->swizzle[1] = 2;
				r600_src->swizzle[2] = 2;
				r600_src->swizzle[3] = 2;
			} else {
				r600_src->sel = ctx->tess_input_info;
				r600_src->swizzle[0] = 3;
				r600_src->swizzle[1] = 3;
				r600_src->swizzle[2] = 3;
				r600_src->swizzle[3] = 3;
			}
		} else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
		} else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
			r600_src->sel = 0;
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
		}
	} else {
		/* regular file: apply per-file GPR offset and relative flag */
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		if (tgsi_src->Register.Dimension) {
			/* 2D constants select a kcache bank, possibly indirectly */
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}
1422
1423 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
1424 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
1425 unsigned int dst_reg)
1426 {
1427 struct r600_bytecode_vtx vtx;
1428 unsigned int ar_reg;
1429 int r;
1430
1431 if (offset) {
1432 struct r600_bytecode_alu alu;
1433
1434 memset(&alu, 0, sizeof(alu));
1435
1436 alu.op = ALU_OP2_ADD_INT;
1437 alu.src[0].sel = ctx->bc->ar_reg;
1438 alu.src[0].chan = ar_chan;
1439
1440 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1441 alu.src[1].value = offset;
1442
1443 alu.dst.sel = dst_reg;
1444 alu.dst.chan = ar_chan;
1445 alu.dst.write = 1;
1446 alu.last = 1;
1447
1448 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1449 return r;
1450
1451 ar_reg = dst_reg;
1452 } else {
1453 ar_reg = ctx->bc->ar_reg;
1454 }
1455
1456 memset(&vtx, 0, sizeof(vtx));
1457 vtx.buffer_id = cb_idx;
1458 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1459 vtx.src_gpr = ar_reg;
1460 vtx.src_sel_x = ar_chan;
1461 vtx.mega_fetch_count = 16;
1462 vtx.dst_gpr = dst_reg;
1463 vtx.dst_sel_x = 0; /* SEL_X */
1464 vtx.dst_sel_y = 1; /* SEL_Y */
1465 vtx.dst_sel_z = 2; /* SEL_Z */
1466 vtx.dst_sel_w = 3; /* SEL_W */
1467 vtx.data_format = FMT_32_32_32_32_FLOAT;
1468 vtx.num_format_all = 2; /* NUM_FORMAT_SCALED */
1469 vtx.format_comp_all = 1; /* FORMAT_COMP_SIGNED */
1470 vtx.endian = r600_endian_swap(32);
1471 vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;
1472
1473 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1474 return r;
1475
1476 return 0;
1477 }
1478
/* Fetch one per-vertex GS input from the ESGS ring into dst_reg.xyzw.
 * Handles both indirect vertex indexing (src->Dimension.Indirect) and
 * indirect attribute indexing (src->Register.Indirect). */
static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	int r;
	unsigned index = src->Register.Index;
	unsigned vtx_id = src->Dimension.Index;
	int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
	int offset_chan = vtx_id % 3;
	int t2 = 0;

	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */

	if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
		offset_chan = 3;

	if (src->Dimension.Indirect || src->Register.Indirect)
		t2 = r600_get_temp(ctx);

	if (src->Dimension.Indirect) {
		int treg[3];
		struct r600_bytecode_alu alu;
		int r, i;
		unsigned addr_reg;
		addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
		/* relative addressing below uses AR from address reg 0, so copy
		 * the selected address register there first if it isn't reg 0 */
		if (src->DimIndirect.Index > 0) {
			r = single_alu_op2(ctx, ALU_OP1_MOV,
					   ctx->bc->ar_reg, 0,
					   addr_reg, 0,
					   0, 0);
			if (r)
				return r;
		}
		/*
		   we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
		   at least this is what fglrx seems to do. */
		for (i = 0; i < 3; i++) {
			treg[i] = r600_get_temp(ctx);
		}
		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

		/* copy the three vertex ring offsets into consecutive GPRs */
		for (i = 0; i < 3; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].sel = ctx->gs_rotated_input[0];
			alu.src[0].chan = i == 2 ? 3 : i;
			alu.dst.sel = treg[i];
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		/* relatively index the array to pick the ring offset for the
		 * requested vertex, landing it in t2.x */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = treg[0];
		alu.src[0].rel = 1;
		alu.dst.sel = t2;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
	}

	if (src->Register.Indirect) {
		int addr_reg;
		unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];

		addr_reg = get_address_file_reg(ctx, src->Indirect.Index);

		/* pull the value from index_reg */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   t2, 1,
				   addr_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, first);
		if (r)
			return r;
		/* ring offset += 4 dwords per attribute element */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   t2, 0,
				   t2, 1,
				   V_SQ_ALU_SRC_LITERAL, 4,
				   offset_reg, offset_chan);
		if (r)
			return r;
		offset_reg = t2;
		offset_chan = 0;
		index = src->Register.Index - first;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = offset_reg;
	vtx.src_sel_x = offset_chan;
	vtx.offset = index * 16; /*bytes*/
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0; /* SEL_X */
	vtx.dst_sel_y = 1; /* SEL_Y */
	vtx.dst_sel_z = 2; /* SEL_Z */
	vtx.dst_sel_w = 3; /* SEL_W */
	if (ctx->bc->chip_class >= EVERGREEN) {
		vtx.use_const_fields = 1;
	} else {
		vtx.data_format = FMT_32_32_32_32_FLOAT;
	}

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1595
1596 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1597 {
1598 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1599 unsigned i;
1600
1601 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1602 struct tgsi_full_src_register *src = &inst->Src[i];
1603
1604 if (src->Register.File == TGSI_FILE_INPUT) {
1605 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1606 /* primitive id is in R0.z */
1607 ctx->src[i].sel = 0;
1608 ctx->src[i].swizzle[0] = 2;
1609 }
1610 }
1611 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1612 int treg = r600_get_temp(ctx);
1613
1614 fetch_gs_input(ctx, src, treg);
1615 ctx->src[i].sel = treg;
1616 ctx->src[i].rel = 0;
1617 }
1618 }
1619 return 0;
1620 }
1621
1622
1623 /* Tessellation shaders pass outputs to the next shader using LDS.
1624 *
1625 * LS outputs = TCS(HS) inputs
1626 * TCS(HS) outputs = TES(DS) inputs
1627 *
1628 * The LDS layout is:
1629 * - TCS inputs for patch 0
1630 * - TCS inputs for patch 1
1631 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
1632 * - ...
1633 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
1634 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
1635 * - TCS outputs for patch 1
1636 * - Per-patch TCS outputs for patch 1
1637 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
1638 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
1639 * - ...
1640 *
1641 * All three shaders VS(LS), TCS, TES share the same LDS space.
1642 */
/* this will return with the dw address in temp_reg.x */
/* Compute the LDS byte address of a tessellation input/output element.
 * Exactly one of dst/src describes the register; temp_reg.x must already
 * hold the base address and receives base + vertex*stride + param*16
 * (+ 16*relative index for indirect addressing). */
static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
				 const struct tgsi_full_dst_register *dst,
				 const struct tgsi_full_src_register *src,
				 int stride_bytes_reg, int stride_bytes_chan)
{
	struct tgsi_full_dst_register reg;
	ubyte *name, *index, *array_first;
	int r;
	int param;
	struct tgsi_shader_info *info = &ctx->info;
	/* Set the register description. The address computation is the same
	 * for sources and destinations. */
	if (src) {
		reg.Register.File = src->Register.File;
		reg.Register.Index = src->Register.Index;
		reg.Register.Indirect = src->Register.Indirect;
		reg.Register.Dimension = src->Register.Dimension;
		reg.Indirect = src->Indirect;
		reg.Dimension = src->Dimension;
		reg.DimIndirect = src->DimIndirect;
	} else
		reg = *dst;

	/* If the register is 2-dimensional (e.g. an array of vertices
	 * in a primitive), calculate the base address of the vertex. */
	if (reg.Register.Dimension) {
		int sel, chan;
		if (reg.Dimension.Indirect) {
			unsigned addr_reg;
			assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);

			addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
			/* pull the value from index_reg */
			sel = addr_reg;
			chan = 0;
		} else {
			sel = V_SQ_ALU_SRC_LITERAL;
			chan = reg.Dimension.Index;
		}

		/* temp.x += vertex_index * stride_bytes */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   stride_bytes_reg, stride_bytes_chan,
				   sel, chan,
				   temp_reg, 0);
		if (r)
			return r;
	}

	/* pick the semantic tables matching the register file */
	if (reg.Register.File == TGSI_FILE_INPUT) {
		name = info->input_semantic_name;
		index = info->input_semantic_index;
		array_first = info->input_array_first;
	} else if (reg.Register.File == TGSI_FILE_OUTPUT) {
		name = info->output_semantic_name;
		index = info->output_semantic_index;
		array_first = info->output_array_first;
	} else {
		assert(0);
		return -1;
	}
	if (reg.Register.Indirect) {
		int addr_reg;
		int first;
		/* Add the relative address of the element. */
		if (reg.Indirect.ArrayID)
			first = array_first[reg.Indirect.ArrayID];
		else
			first = reg.Register.Index;

		addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);

		/* pull the value from index_reg: temp.x += 16 * rel_index */
		r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 16,
				   addr_reg, 0,
				   temp_reg, 0);
		if (r)
			return r;

		param = r600_get_lds_unique_index(name[first],
						  index[first]);

	} else {
		param = r600_get_lds_unique_index(name[reg.Register.Index],
						  index[reg.Register.Index]);
	}

	/* add to base_addr - passed in temp_reg.x */
	if (param) {
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 0,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, param * 16);
		if (r)
			return r;

	}
	return 0;
}
1745
/* Read up to four dwords from LDS into dst_reg, one per channel set in
 * mask. temp_reg.x holds the byte address of channel 0; addresses for the
 * remaining channels are derived at 4-byte strides. The reads are issued
 * in three passes (address setup, LDS_READ_RET, OQ pop) so all fetches
 * land in one clause before any result is popped. */
static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
			       unsigned dst_reg, unsigned mask)
{
	struct r600_bytecode_alu alu;
	int r, i, lasti;

	/* NOTE(review): start a fresh CF when the current one is near its
	 * size limit, presumably so the read/pop pairs stay together —
	 * confirm against the CF clause dword limit. */
	if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
		ctx->bc->force_add_cf = 1;

	lasti = tgsi_last_instruction(mask);
	/* channel 0 reuses temp_reg.x directly; compute addresses for the rest */
	for (i = 1; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* emit an LDS_READ_RET */
		memset(&alu, 0, sizeof(alu));
		alu.op = LDS_OP1_LDS_READ_RET;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;
		alu.src[1].sel = V_SQ_ALU_SRC_0;
		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.is_lds_idx_op = true;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	/* pop the results in the same order they were queued */
	for (i = 0; i <= lasti; i++) {
		if (!(mask & (1 << i)))
			continue;

		/* then read from LDS_OQ_A_POP */
		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
		alu.src[0].chan = 0;
		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
1805
1806 static int fetch_mask(struct tgsi_src_register *reg)
1807 {
1808 int mask = 0;
1809 mask |= 1 << reg->SwizzleX;
1810 mask |= 1 << reg->SwizzleY;
1811 mask |= 1 << reg->SwizzleZ;
1812 mask |= 1 << reg->SwizzleW;
1813 return mask;
1814 }
1815
1816 static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1817 {
1818 int r;
1819 unsigned temp_reg = r600_get_temp(ctx);
1820
1821 r = get_lds_offset0(ctx, 2, temp_reg,
1822 src->Register.Dimension ? false : true);
1823 if (r)
1824 return r;
1825
1826 /* the base address is now in temp.x */
1827 r = r600_get_byte_address(ctx, temp_reg,
1828 NULL, src, ctx->tess_output_info, 1);
1829 if (r)
1830 return r;
1831
1832 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1833 if (r)
1834 return r;
1835 return 0;
1836 }
1837
1838 static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1839 {
1840 int r;
1841 unsigned temp_reg = r600_get_temp(ctx);
1842
1843 /* t.x = ips * r0.y */
1844 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
1845 temp_reg, 0,
1846 ctx->tess_input_info, 0,
1847 0, 1);
1848
1849 if (r)
1850 return r;
1851
1852 /* the base address is now in temp.x */
1853 r = r600_get_byte_address(ctx, temp_reg,
1854 NULL, src, ctx->tess_input_info, 1);
1855 if (r)
1856 return r;
1857
1858 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1859 if (r)
1860 return r;
1861 return 0;
1862 }
1863
1864 static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1865 {
1866 int r;
1867 unsigned temp_reg = r600_get_temp(ctx);
1868
1869 r = get_lds_offset0(ctx, 1, temp_reg,
1870 src->Register.Dimension ? false : true);
1871 if (r)
1872 return r;
1873 /* the base address is now in temp.x */
1874 r = r600_get_byte_address(ctx, temp_reg,
1875 NULL, src,
1876 ctx->tess_output_info, 1);
1877 if (r)
1878 return r;
1879
1880 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1881 if (r)
1882 return r;
1883 return 0;
1884 }
1885
1886 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
1887 {
1888 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1889 unsigned i;
1890
1891 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1892 struct tgsi_full_src_register *src = &inst->Src[i];
1893
1894 if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
1895 int treg = r600_get_temp(ctx);
1896 fetch_tes_input(ctx, src, treg);
1897 ctx->src[i].sel = treg;
1898 ctx->src[i].rel = 0;
1899 }
1900 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
1901 int treg = r600_get_temp(ctx);
1902 fetch_tcs_input(ctx, src, treg);
1903 ctx->src[i].sel = treg;
1904 ctx->src[i].rel = 0;
1905 }
1906 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
1907 int treg = r600_get_temp(ctx);
1908 fetch_tcs_output(ctx, src, treg);
1909 ctx->src[i].sel = treg;
1910 ctx->src[i].rel = 0;
1911 }
1912 }
1913 return 0;
1914 }
1915
/* Translate all instruction sources via tgsi_src, then rewrite constant
 * sources so at most one remains a direct kcache access: relatively
 * addressed constants are fetched with a VTX fetch, and all other
 * constant sources except the last are copied to temporaries. */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* translate every source and count the constant ones */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	/* j counts how many constants still need splitting; the last one
	 * (j == 0) may stay as a direct kcache access */
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			/* relative constant: fetch it into a temp via VTX
			 * (sel - 512 converts the kcache sel back to a
			 * constant-buffer index) */
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* copy the constant's four channels into a temp GPR */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
1970
1971 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1972 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1973 {
1974 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1975 struct r600_bytecode_alu alu;
1976 int i, j, k, nliteral, r;
1977
1978 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1979 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1980 nliteral++;
1981 }
1982 }
1983 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1984 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1985 int treg = r600_get_temp(ctx);
1986 for (k = 0; k < 4; k++) {
1987 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1988 alu.op = ALU_OP1_MOV;
1989 alu.src[0].sel = ctx->src[i].sel;
1990 alu.src[0].chan = k;
1991 alu.src[0].value = ctx->src[i].value[k];
1992 alu.dst.sel = treg;
1993 alu.dst.chan = k;
1994 alu.dst.write = 1;
1995 if (k == 3)
1996 alu.last = 1;
1997 r = r600_bytecode_add_alu(ctx->bc, &alu);
1998 if (r)
1999 return r;
2000 }
2001 ctx->src[i].sel = treg;
2002 j--;
2003 }
2004 }
2005 return 0;
2006 }
2007
2008 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
2009 {
2010 int i, r, count = ctx->shader->ninput;
2011
2012 for (i = 0; i < count; i++) {
2013 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2014 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
2015 if (r)
2016 return r;
2017 }
2018 }
2019 return 0;
2020 }
2021
2022 static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
2023 int stream, unsigned *stream_item_size UNUSED)
2024 {
2025 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
2026 unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
2027 int j, r;
2028 unsigned i;
2029
2030 /* Sanity checking. */
2031 if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
2032 R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
2033 r = -EINVAL;
2034 goto out_err;
2035 }
2036 for (i = 0; i < so->num_outputs; i++) {
2037 if (so->output[i].output_buffer >= 4) {
2038 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
2039 so->output[i].output_buffer);
2040 r = -EINVAL;
2041 goto out_err;
2042 }
2043 }
2044
2045 /* Initialize locations where the outputs are stored. */
2046 for (i = 0; i < so->num_outputs; i++) {
2047
2048 so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
2049 start_comp[i] = so->output[i].start_component;
2050 /* Lower outputs with dst_offset < start_component.
2051 *
2052 * We can only output 4D vectors with a write mask, e.g. we can
2053 * only output the W component at offset 3, etc. If we want
2054 * to store Y, Z, or W at buffer offset 0, we need to use MOV
2055 * to move it to X and output X. */
2056 if (so->output[i].dst_offset < so->output[i].start_component) {
2057 unsigned tmp = r600_get_temp(ctx);
2058
2059 for (j = 0; j < so->output[i].num_components; j++) {
2060 struct r600_bytecode_alu alu;
2061 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2062 alu.op = ALU_OP1_MOV;
2063 alu.src[0].sel = so_gpr[i];
2064 alu.src[0].chan = so->output[i].start_component + j;
2065
2066 alu.dst.sel = tmp;
2067 alu.dst.chan = j;
2068 alu.dst.write = 1;
2069 if (j == so->output[i].num_components - 1)
2070 alu.last = 1;
2071 r = r600_bytecode_add_alu(ctx->bc, &alu);
2072 if (r)
2073 return r;
2074 }
2075 start_comp[i] = 0;
2076 so_gpr[i] = tmp;
2077 }
2078 }
2079
2080 /* Write outputs to buffers. */
2081 for (i = 0; i < so->num_outputs; i++) {
2082 struct r600_bytecode_output output;
2083
2084 if (stream != -1 && stream != so->output[i].output_buffer)
2085 continue;
2086
2087 memset(&output, 0, sizeof(struct r600_bytecode_output));
2088 output.gpr = so_gpr[i];
2089 output.elem_size = so->output[i].num_components - 1;
2090 if (output.elem_size == 2)
2091 output.elem_size = 3; // 3 not supported, write 4 with junk at end
2092 output.array_base = so->output[i].dst_offset - start_comp[i];
2093 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2094 output.burst_count = 1;
2095 /* array_size is an upper limit for the burst_count
2096 * with MEM_STREAM instructions */
2097 output.array_size = 0xFFF;
2098 output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
2099
2100 if (ctx->bc->chip_class >= EVERGREEN) {
2101 switch (so->output[i].output_buffer) {
2102 case 0:
2103 output.op = CF_OP_MEM_STREAM0_BUF0;
2104 break;
2105 case 1:
2106 output.op = CF_OP_MEM_STREAM0_BUF1;
2107 break;
2108 case 2:
2109 output.op = CF_OP_MEM_STREAM0_BUF2;
2110 break;
2111 case 3:
2112 output.op = CF_OP_MEM_STREAM0_BUF3;
2113 break;
2114 }
2115 output.op += so->output[i].stream * 4;
2116 assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
2117 ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
2118 } else {
2119 switch (so->output[i].output_buffer) {
2120 case 0:
2121 output.op = CF_OP_MEM_STREAM0;
2122 break;
2123 case 1:
2124 output.op = CF_OP_MEM_STREAM1;
2125 break;
2126 case 2:
2127 output.op = CF_OP_MEM_STREAM2;
2128 break;
2129 case 3:
2130 output.op = CF_OP_MEM_STREAM3;
2131 break;
2132 }
2133 ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
2134 }
2135 r = r600_bytecode_add_output(ctx->bc, &output);
2136 if (r)
2137 goto out_err;
2138 }
2139 return 0;
2140 out_err:
2141 return r;
2142 }
2143
2144 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2145 {
2146 struct r600_bytecode_alu alu;
2147 unsigned reg;
2148
2149 if (!ctx->shader->vs_out_edgeflag)
2150 return;
2151
2152 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2153
2154 /* clamp(x, 0, 1) */
2155 memset(&alu, 0, sizeof(alu));
2156 alu.op = ALU_OP1_MOV;
2157 alu.src[0].sel = reg;
2158 alu.dst.sel = reg;
2159 alu.dst.write = 1;
2160 alu.dst.clamp = 1;
2161 alu.last = 1;
2162 r600_bytecode_add_alu(ctx->bc, &alu);
2163
2164 memset(&alu, 0, sizeof(alu));
2165 alu.op = ALU_OP1_FLT_TO_INT;
2166 alu.src[0].sel = reg;
2167 alu.dst.sel = reg;
2168 alu.dst.write = 1;
2169 alu.last = 1;
2170 r600_bytecode_add_alu(ctx->bc, &alu);
2171 }
2172
/* Build the "GS copy shader": a small VS-stage program that runs after a
 * geometry shader, reads each emitted vertex back from the GSVS ring
 * buffer, performs streamout for it and exports position/parameters the
 * way a regular vertex shader would.
 *
 * The built shader is stored in gs->gs_copy_shader.  Returns the result
 * of r600_bytecode_build() (0 on success), or 0 if allocating the copy
 * shader failed. */
static int generate_gs_copy_shader(struct r600_context *rctx,
				   struct r600_pipe_shader *gs,
				   struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	unsigned ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int next_clip_pos = 61, next_param = 0;
	unsigned i, j;
	int ring;
	bool only_ring_0 = true;
	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	/* The copy shader exports exactly the GS outputs. */
	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* Incoming R0.x carries the ring offset in its low 30 bits and
	 * (per the PRED_SETE below) the stream id in the top 2 bits. */
	/* R0.x = R0.x & 0x3fffffff */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30  (extract the stream id) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		/* One vec4 (16 bytes) per output; GPR 0 holds the index. */
		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	/* First free GPR after the fetched outputs. */
	ctx.temp_reg = i + 1;
	/* Emit one predicated streamout section per used vertex stream
	 * (ring), walked from 3 down to 0. */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				if (ring > 0)
					only_ring_0 = false;
				break;
			}
		}
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label of the previous ring's section
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring — only run this section for
		 * vertices belonging to this stream */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->chip_class == R600) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* Only outputs belonging to stream 0 are exported to the
		 * rasterizer; other streams exist solely for streamout. */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			/* Point size goes in the misc vector (POS slot 61). */
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
			ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* The hardware requires at least one position export; emit a dummy
	 * one (all channels masked) if the GS produced none. */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	/* Likewise a dummy PARAM export if there were no parameters. */
	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	/* The final export of each kind must be the *_DONE variant. */
	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	/* Close the last predicated streamout section. */
	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	/* One stack slot for the predicate push above. */
	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}
2483
2484 static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2485 {
2486 if (ind) {
2487 struct r600_bytecode_alu alu;
2488 int r;
2489
2490 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2491 alu.op = ALU_OP2_ADD_INT;
2492 alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2493 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2494 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2495 alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2496 alu.dst.write = 1;
2497 alu.last = 1;
2498 r = r600_bytecode_add_alu(ctx->bc, &alu);
2499 if (r)
2500 return r;
2501 }
2502 return 0;
2503 }
2504
/* Write the current vertex's outputs to the ES->GS or GS->VS ring buffer.
 *
 * @ctx:    shader context (ES or GS)
 * @so:     unused
 * @stream: vertex stream to write (selects the MEM_RING opcode), or -1
 *          which is treated as stream 0
 * @ind:    use indirect addressing (index taken from the per-stream
 *          export treg) instead of an absolute ring offset
 *
 * Increments ctx->gs_next_vertex afterwards.  Always returns 0. */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int ring_offset;
	unsigned i, k;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			/* output not consumed by the GS -> nothing to write */
			if (ring_offset == -1)
				continue;
		} else {
			/* GS case: outputs are packed as consecutive vec4s */
			ring_offset = idx * 16;
			idx++;
		}

		/* position is only meaningful on stream 0 */
		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	++ctx->gs_next_vertex;
	return 0;
}
2575
2576
2577 static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2578 {
2579 int r;
2580 struct r600_bytecode_vtx vtx;
2581 int temp_val = ctx->temp_reg;
2582 /* need to store the TCS output somewhere */
2583 r = single_alu_op2(ctx, ALU_OP1_MOV,
2584 temp_val, 0,
2585 V_SQ_ALU_SRC_LITERAL, 0,
2586 0, 0);
2587 if (r)
2588 return r;
2589
2590 /* used by VS/TCS */
2591 if (ctx->tess_input_info) {
2592 /* fetch tcs input values into resv space */
2593 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2594 vtx.op = FETCH_OP_VFETCH;
2595 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2596 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2597 vtx.mega_fetch_count = 16;
2598 vtx.data_format = FMT_32_32_32_32;
2599 vtx.num_format_all = 2;
2600 vtx.format_comp_all = 1;
2601 vtx.use_const_fields = 0;
2602 vtx.endian = r600_endian_swap(32);
2603 vtx.srf_mode_all = 1;
2604 vtx.offset = 0;
2605 vtx.dst_gpr = ctx->tess_input_info;
2606 vtx.dst_sel_x = 0;
2607 vtx.dst_sel_y = 1;
2608 vtx.dst_sel_z = 2;
2609 vtx.dst_sel_w = 3;
2610 vtx.src_gpr = temp_val;
2611 vtx.src_sel_x = 0;
2612
2613 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2614 if (r)
2615 return r;
2616 }
2617
2618 /* used by TCS/TES */
2619 if (ctx->tess_output_info) {
2620 /* fetch tcs output values into resv space */
2621 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2622 vtx.op = FETCH_OP_VFETCH;
2623 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2624 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2625 vtx.mega_fetch_count = 16;
2626 vtx.data_format = FMT_32_32_32_32;
2627 vtx.num_format_all = 2;
2628 vtx.format_comp_all = 1;
2629 vtx.use_const_fields = 0;
2630 vtx.endian = r600_endian_swap(32);
2631 vtx.srf_mode_all = 1;
2632 vtx.offset = 16;
2633 vtx.dst_gpr = ctx->tess_output_info;
2634 vtx.dst_sel_x = 0;
2635 vtx.dst_sel_y = 1;
2636 vtx.dst_sel_z = 2;
2637 vtx.dst_sel_w = 3;
2638 vtx.src_gpr = temp_val;
2639 vtx.src_sel_x = 0;
2640
2641 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2642 if (r)
2643 return r;
2644 }
2645 return 0;
2646 }
2647
/* Emit the LDS stores that make a VS-as-LS shader's outputs visible to
 * the TCS: each output vec4 is written to LDS at
 * (vertex_id * vertex_dw_stride + slot_index * 16), two dwords per
 * LDS_WRITE_REL.  Returns 0 on success. */
static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
{
	int j, r;
	int temp_reg;
	unsigned i;

	/* fetch tcs input values into input_vals */
	ctx->tess_input_info = r600_get_temp(ctx);
	ctx->tess_output_info = 0;
	r = r600_fetch_tess_io_info(ctx);
	if (r)
		return r;

	temp_reg = r600_get_temp(ctx);
	/* dst reg contains LDS address stride * idx */
	/* MUL vertexID, vertex_dw_stride */
	r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
			   temp_reg, 0,
			   ctx->tess_input_info, 1,
			   0, 1); /* rel id in r0.y? */
	if (r)
		return r;

	for (i = 0; i < ctx->shader->noutput; i++) {
		struct r600_bytecode_alu alu;
		/* LDS slot index for this semantic/sid pair */
		int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid);

		/* temp.y = base address + slot byte offset (skipped for
		 * slot 0, where temp.x already is the address) */
		if (param) {
			r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
					   temp_reg, 1,
					   temp_reg, 0,
					   V_SQ_ALU_SRC_LITERAL, param * 16);
			if (r)
				return r;
		}

		/* temp.z = address of the second dword pair (+8 bytes) */
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, 2,
				   temp_reg, param ? 1 : 0,
				   V_SQ_ALU_SRC_LITERAL, 8);
		if (r)
			return r;


		/* Two LDS_WRITE_REL ops: j==0 stores channels x,y at the
		 * slot address, j==1 stores z,w at address+8. */
		for (j = 0; j < 2; j++) {
			int chan = (j == 1) ? 2 : (param ? 1 : 0);
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = chan;
			alu.src[1].sel = ctx->shader->output[i].gpr;
			alu.src[1].chan = j * 2;
			alu.src[2].sel = ctx->shader->output[i].gpr;
			alu.src[2].chan = (j * 2) + 1;
			/* last/lds_idx setup for the paired-dword write;
			 * lds_idx = 1 presumably selects the +1-dword second
			 * store of WRITE_REL — TODO confirm against ISA docs */
			alu.last = 1;
			alu.dst.chan = 0;
			alu.lds_idx = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2713
/* Store the destination of the current TGSI instruction to LDS when it
 * targets a TCS OUTPUT register.  Per-channel LDS addresses are computed
 * into temp_reg (chan 0 = base from r600_get_byte_address, chan i =
 * base + 4*i), then written with LDS_WRITE (one dword) or
 * LDS_WRITE_REL (two consecutive dwords when an xy or zw pair is fully
 * covered by the write mask).  Returns 0 on success. */
static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	const struct tgsi_full_dst_register *dst = &inst->Dst[0];
	int i, r, lasti;
	int temp_reg = r600_get_temp(ctx);
	struct r600_bytecode_alu alu;
	unsigned write_mask = dst->Register.WriteMask;

	if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
		return 0;

	r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
	if (r)
		return r;

	/* the base address is now in temp.x */
	r = r600_get_byte_address(ctx, temp_reg,
				  &inst->Dst[0], NULL, ctx->tess_output_info, 1);
	if (r)
		return r;

	/* LDS write */
	lasti = tgsi_last_instruction(write_mask);
	/* temp.i = temp.x + 4*i for channels 1..lasti (channel 0 uses the
	 * base address in temp.x directly) */
	for (i = 1; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;
		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   temp_reg, i,
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
	}

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		/* full xy or zw pair -> single LDS_WRITE_REL storing both
		 * dwords; the loop then skips the second channel */
		if ((i == 0 && ((write_mask & 3) == 3)) ||
		    (i == 2 && ((write_mask & 0xc) == 0xc))) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = LDS_OP3_LDS_WRITE_REL;
			alu.src[0].sel = temp_reg;
			alu.src[0].chan = i;

			alu.src[1].sel = dst->Register.Index;
			alu.src[1].sel += ctx->file_offset[dst->Register.File];
			alu.src[1].chan = i;

			alu.src[2].sel = dst->Register.Index;
			alu.src[2].sel += ctx->file_offset[dst->Register.File];
			alu.src[2].chan = i + 1;
			alu.lds_idx = 1;
			alu.dst.chan = 0;
			alu.last = 1;
			alu.is_lds_idx_op = true;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			i += 1;
			continue;
		}
		/* single-channel store */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = LDS_OP2_LDS_WRITE;
		alu.src[0].sel = temp_reg;
		alu.src[0].chan = i;

		alu.src[1].sel = dst->Register.Index;
		alu.src[1].sel += ctx->file_offset[dst->Register.File];
		alu.src[1].chan = i;

		alu.src[2].sel = V_SQ_ALU_SRC_0;
		alu.dst.chan = 0;
		alu.last = 1;
		alu.is_lds_idx_op = true;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2797
2798 static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
2799 int output_idx)
2800 {
2801 int param;
2802 unsigned temp_reg = r600_get_temp(ctx);
2803 unsigned name = ctx->shader->output[output_idx].name;
2804 int dreg = ctx->shader->output[output_idx].gpr;
2805 int r;
2806
2807 param = r600_get_lds_unique_index(name, 0);
2808 r = get_lds_offset0(ctx, 1, temp_reg, true);
2809 if (r)
2810 return r;
2811
2812 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2813 temp_reg, 0,
2814 temp_reg, 0,
2815 V_SQ_ALU_SRC_LITERAL, param * 16);
2816 if (r)
2817 return r;
2818
2819 do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
2820 return 0;
2821 }
2822
/* Emit the tessellation-factor writes at the end of a TCS: read the
 * TESSINNER/TESSOUTER outputs back from LDS, compute each factor's
 * address in the TF buffer and store the (index, value) pairs with GDS
 * TF_WRITE.  The whole section is predicated so only one invocation per
 * patch performs the writes.  Returns 0 on success, -1 on a malformed
 * shader (bad prim mode or missing factor outputs). */
static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
{
	int stride, outer_comps, inner_comps;
	int tessinner_idx = -1, tessouter_idx = -1;
	int i, r;
	unsigned j;
	int temp_reg = r600_get_temp(ctx);
	int treg[3] = {-1, -1, -1};
	struct r600_bytecode_alu alu;
	struct r600_bytecode_cf *cf_jump, *cf_pop;

	/* only execute factor emission for invocation 0 */
	/* PRED_SETE_INT __, R0.z, 0  (src chan 2; literal value is 0
	 * because src[1].value is left zeroed) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_PRED_SETE_INT;
	alu.src[0].chan = 2;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.execute_mask = 1;
	alu.update_pred = 1;
	alu.last = 1;
	r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	cf_jump = ctx->bc->cf_last;

	/* One temp per pair of factors (index+value interleaved in .xy/.zw). */
	treg[0] = r600_get_temp(ctx);
	switch (ctx->shader->tcs_prim_mode) {
	case PIPE_PRIM_LINES:
		stride = 8; /* 2 dwords, 1 vec2 store */
		outer_comps = 2;
		inner_comps = 0;
		break;
	case PIPE_PRIM_TRIANGLES:
		stride = 16; /* 4 dwords, 1 vec4 store */
		outer_comps = 3;
		inner_comps = 1;
		treg[1] = r600_get_temp(ctx);
		break;
	case PIPE_PRIM_QUADS:
		stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
		outer_comps = 4;
		inner_comps = 2;
		treg[1] = r600_get_temp(ctx);
		treg[2] = r600_get_temp(ctx);
		break;
	default:
		assert(0);
		return -1;
	}

	/* R0 is InvocationID, RelPatchID, PatchID, tf_base */
	/* TF_WRITE takes index in R.x, value in R.y */
	for (j = 0; j < ctx->shader->noutput; j++) {
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
			tessinner_idx = j;
		if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
			tessouter_idx = j;
	}

	if (tessouter_idx == -1)
		return -1;

	if (tessinner_idx == -1 && inner_comps)
		return -1;

	if (tessouter_idx != -1) {
		r = r600_tess_factor_read(ctx, tessouter_idx);
		if (r)
			return r;
	}

	if (tessinner_idx != -1) {
		r = r600_tess_factor_read(ctx, tessinner_idx);
		if (r)
			return r;
	}

	/* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
	/* r.x = relpatchid(r0.y) * tf_stride */

	/* multiply incoming r0.y * stride - t.x = r0.y * stride */
	/* add incoming r0.w to it: t.x = t.x + r0.w */
	r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
			   temp_reg, 0,
			   0, 1,
			   V_SQ_ALU_SRC_LITERAL, stride,
			   0, 3);
	if (r)
		return r;

	/* Build (index, value) pairs: treg[i/2].{x,z} = TF buffer address
	 * of factor i, treg[i/2].{y,w} = the factor itself.  Outer factors
	 * come first, then inner. */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
		int out_comp = i >= outer_comps ? i - outer_comps : i;

		/* isolines: the two outer factors are stored swapped */
		if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
			if (out_comp == 1)
				out_comp = 0;
			else if (out_comp == 0)
				out_comp = 1;
		}

		r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
				   treg[i / 2], (2 * (i % 2)),
				   temp_reg, 0,
				   V_SQ_ALU_SRC_LITERAL, 4 * i);
		if (r)
			return r;
		r = single_alu_op2(ctx, ALU_OP1_MOV,
				   treg[i / 2], 1 + (2 * (i%2)),
				   ctx->shader->output[out_idx].gpr, out_comp,
				   0, 0);
		if (r)
			return r;
	}
	/* Emit one GDS TF_WRITE per factor (index from .x/.z, value from
	 * .y/.w; all destination selects masked). */
	for (i = 0; i < outer_comps + inner_comps; i++) {
		struct r600_bytecode_gds gds;

		memset(&gds, 0, sizeof(struct r600_bytecode_gds));
		gds.src_gpr = treg[i / 2];
		gds.src_sel_x = 2 * (i % 2);
		gds.src_sel_y = 1 + (2 * (i % 2));
		gds.src_sel_z = 4;
		gds.dst_sel_x = 7;
		gds.dst_sel_y = 7;
		gds.dst_sel_z = 7;
		gds.dst_sel_w = 7;
		gds.op = FETCH_OP_TF_WRITE;
		r = r600_bytecode_add_gds(ctx->bc, &gds);
		if (r)
			return r;
	}

	// Patch up jump label
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
	cf_pop = ctx->bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	return 0;
}
2966
2967 /*
2968 * We have to work out the thread ID for load and atomic
2969 * operations, which store the returned value to an index
2970 * in an intermediate buffer.
2971 * The index is calculated by taking the thread id,
2972 * calculated from the MBCNT instructions.
2973 * Then the shader engine ID is multiplied by 256,
2974 * and the wave id is added.
 * Then the result is multiplied by 64 and thread id is
2976 * added.
2977 */
2978 static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
2979 {
2980 struct r600_bytecode_alu alu;
2981 int r;
2982
2983 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2984 alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
2985 alu.dst.sel = ctx->temp_reg;
2986 alu.dst.chan = 0;
2987 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
2988 alu.src[0].value = 0xffffffff;
2989 alu.dst.write = 1;
2990 r = r600_bytecode_add_alu(ctx->bc, &alu);
2991 if (r)
2992 return r;
2993
2994 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2995 alu.op = ALU_OP1_MBCNT_32HI_INT;
2996 alu.dst.sel = ctx->temp_reg;
2997 alu.dst.chan = 1;
2998 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
2999 alu.src[0].value = 0xffffffff;
3000 alu.dst.write = 1;
3001 r = r600_bytecode_add_alu(ctx->bc, &alu);
3002 if (r)
3003 return r;
3004
3005 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3006 alu.op = ALU_OP3_MULADD_UINT24;
3007 alu.dst.sel = ctx->temp_reg;
3008 alu.dst.chan = 2;
3009 alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
3010 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3011 alu.src[1].value = 256;
3012 alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
3013 alu.dst.write = 1;
3014 alu.is_op3 = 1;
3015 alu.last = 1;
3016 r = r600_bytecode_add_alu(ctx->bc, &alu);
3017 if (r)
3018 return r;
3019
3020 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3021 ctx->thread_id_gpr, 1,
3022 ctx->temp_reg, 2,
3023 V_SQ_ALU_SRC_LITERAL, 0x40,
3024 ctx->temp_reg, 0);
3025 if (r)
3026 return r;
3027 return 0;
3028 }
3029
3030 static int r600_shader_from_tgsi(struct r600_context *rctx,
3031 struct r600_pipe_shader *pipeshader,
3032 union r600_shader_key key)
3033 {
3034 struct r600_screen *rscreen = rctx->screen;
3035 struct r600_shader *shader = &pipeshader->shader;
3036 struct tgsi_token *tokens = pipeshader->selector->tokens;
3037 struct pipe_stream_output_info so = pipeshader->selector->so;
3038 struct tgsi_full_immediate *immediate;
3039 struct r600_shader_ctx ctx;
3040 struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
3041 unsigned output_done, noutput;
3042 unsigned opcode;
3043 int j, k, r = 0;
3044 unsigned i;
3045 int next_param_base = 0, next_clip_base;
3046 int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
3047 bool indirect_gprs;
3048 bool ring_outputs = false;
3049 bool lds_outputs = false;
3050 bool lds_inputs = false;
3051 bool pos_emitted = false;
3052
3053 ctx.bc = &shader->bc;
3054 ctx.shader = shader;
3055 ctx.native_integers = true;
3056
3057 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
3058 rscreen->has_compressed_msaa_texturing);
3059 ctx.tokens = tokens;
3060 tgsi_scan_shader(tokens, &ctx.info);
3061 shader->indirect_files = ctx.info.indirect_files;
3062
3063 shader->uses_doubles = ctx.info.uses_doubles;
3064 shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
3065 shader->nsys_inputs = 0;
3066
3067 shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0;
3068 indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
3069 tgsi_parse_init(&ctx.parse, tokens);
3070 ctx.type = ctx.info.processor;
3071 shader->processor_type = ctx.type;
3072 ctx.bc->type = shader->processor_type;
3073
3074 switch (ctx.type) {
3075 case PIPE_SHADER_VERTEX:
3076 shader->vs_as_gs_a = key.vs.as_gs_a;
3077 shader->vs_as_es = key.vs.as_es;
3078 shader->vs_as_ls = key.vs.as_ls;
3079 shader->atomic_base = key.vs.first_atomic_counter;
3080 if (shader->vs_as_es)
3081 ring_outputs = true;
3082 if (shader->vs_as_ls)
3083 lds_outputs = true;
3084 break;
3085 case PIPE_SHADER_GEOMETRY:
3086 ring_outputs = true;
3087 shader->atomic_base = key.gs.first_atomic_counter;
3088 shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;
3089 break;
3090 case PIPE_SHADER_TESS_CTRL:
3091 shader->tcs_prim_mode = key.tcs.prim_mode;
3092 shader->atomic_base = key.tcs.first_atomic_counter;
3093 lds_outputs = true;
3094 lds_inputs = true;
3095 break;
3096 case PIPE_SHADER_TESS_EVAL:
3097 shader->tes_as_es = key.tes.as_es;
3098 shader->atomic_base = key.tes.first_atomic_counter;
3099 lds_inputs = true;
3100 if (shader->tes_as_es)
3101 ring_outputs = true;
3102 break;
3103 case PIPE_SHADER_FRAGMENT:
3104 shader->two_side = key.ps.color_two_side;
3105 shader->atomic_base = key.ps.first_atomic_counter;
3106 shader->rat_base = key.ps.nr_cbufs;
3107 shader->image_size_const_offset = key.ps.image_size_const_offset;
3108 break;
3109 default:
3110 break;
3111 }
3112
3113 if (shader->vs_as_es || shader->tes_as_es) {
3114 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
3115 } else {
3116 ctx.gs_for_vs = NULL;
3117 }
3118
3119 ctx.next_ring_offset = 0;
3120 ctx.gs_out_ring_offset = 0;
3121 ctx.gs_next_vertex = 0;
3122 ctx.gs_stream_output_info = &so;
3123
3124 ctx.face_gpr = -1;
3125 ctx.fixed_pt_position_gpr = -1;
3126 ctx.fragcoord_input = -1;
3127 ctx.colors_used = 0;
3128 ctx.clip_vertex_write = 0;
3129
3130 shader->nr_ps_color_exports = 0;
3131 shader->nr_ps_max_color_exports = 0;
3132
3133
3134 /* register allocations */
3135 /* Values [0,127] correspond to GPR[0..127].
3136 * Values [128,159] correspond to constant buffer bank 0
3137 * Values [160,191] correspond to constant buffer bank 1
3138 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
3139 * Values [256,287] correspond to constant buffer bank 2 (EG)
3140 * Values [288,319] correspond to constant buffer bank 3 (EG)
3141 * Other special values are shown in the list below.
3142 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
3143 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
3144 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
3145 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
3146 * 248 SQ_ALU_SRC_0: special constant 0.0.
3147 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
3148 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
3149 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
3150 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
3151 * 253 SQ_ALU_SRC_LITERAL: literal constant.
3152 * 254 SQ_ALU_SRC_PV: previous vector result.
3153 * 255 SQ_ALU_SRC_PS: previous scalar result.
3154 */
3155 for (i = 0; i < TGSI_FILE_COUNT; i++) {
3156 ctx.file_offset[i] = 0;
3157 }
3158
3159 if (ctx.type == PIPE_SHADER_VERTEX) {
3160
3161 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3162 if (ctx.info.num_inputs)
3163 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3164 }
3165 if (ctx.type == PIPE_SHADER_FRAGMENT) {
3166 if (ctx.bc->chip_class >= EVERGREEN)
3167 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3168 else
3169 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3170 }
3171 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3172 /* FIXME 1 would be enough in some cases (3 or less input vertices) */
3173 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3174 }
3175 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3176 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3177 if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3178 bool add_tesscoord = false, add_tess_inout = false;
3179 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3180 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3181 /* if we have tesscoord save one reg */
3182 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3183 add_tesscoord = true;
3184 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3185 ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3186 add_tess_inout = true;
3187 }
3188 if (add_tesscoord || add_tess_inout)
3189 ctx.file_offset[TGSI_FILE_INPUT]++;
3190 if (add_tess_inout)
3191 ctx.file_offset[TGSI_FILE_INPUT]+=2;
3192 }
3193
3194 ctx.file_offset[TGSI_FILE_OUTPUT] =
3195 ctx.file_offset[TGSI_FILE_INPUT] +
3196 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3197 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3198 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
3199
3200 /* Outside the GPR range. This will be translated to one of the
3201 * kcache banks later. */
3202 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3203
3204 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3205 ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3206 ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
3207 ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
3208 ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
3209
3210 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3211 ctx.tess_input_info = ctx.bc->ar_reg + 3;
3212 ctx.tess_output_info = ctx.bc->ar_reg + 4;
3213 ctx.temp_reg = ctx.bc->ar_reg + 5;
3214 } else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3215 ctx.tess_input_info = 0;
3216 ctx.tess_output_info = ctx.bc->ar_reg + 3;
3217 ctx.temp_reg = ctx.bc->ar_reg + 4;
3218 } else if (ctx.type == PIPE_SHADER_GEOMETRY) {
3219 ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
3220 ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
3221 ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
3222 ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
3223 ctx.temp_reg = ctx.bc->ar_reg + 7;
3224 if (ctx.shader->gs_tri_strip_adj_fix) {
3225 ctx.gs_rotated_input[0] = ctx.bc->ar_reg + 7;
3226 ctx.gs_rotated_input[1] = ctx.bc->ar_reg + 8;
3227 ctx.temp_reg += 2;
3228 } else {
3229 ctx.gs_rotated_input[0] = 0;
3230 ctx.gs_rotated_input[1] = 1;
3231 }
3232 } else {
3233 ctx.temp_reg = ctx.bc->ar_reg + 3;
3234 }
3235
3236 if (shader->uses_images && ctx.type == PIPE_SHADER_FRAGMENT) {
3237 ctx.thread_id_gpr = ctx.temp_reg;
3238 ctx.temp_reg++;
3239 } else
3240 ctx.thread_id_gpr = 0;
3241
3242 shader->max_arrays = 0;
3243 shader->num_arrays = 0;
3244 if (indirect_gprs) {
3245
3246 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3247 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3248 ctx.file_offset[TGSI_FILE_OUTPUT] -
3249 ctx.file_offset[TGSI_FILE_INPUT],
3250 0x0F);
3251 }
3252 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3253 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3254 ctx.file_offset[TGSI_FILE_TEMPORARY] -
3255 ctx.file_offset[TGSI_FILE_OUTPUT],
3256 0x0F);
3257 }
3258 }
3259
3260 ctx.nliterals = 0;
3261 ctx.literals = NULL;
3262
3263 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
3264 ctx.info.colors_written == 1;
3265 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3266 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3267
3268 if (ctx.type == PIPE_SHADER_VERTEX ||
3269 ctx.type == PIPE_SHADER_GEOMETRY ||
3270 ctx.type == PIPE_SHADER_TESS_EVAL) {
3271 shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +
3272 ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;
3273 shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;
3274 shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];
3275 }
3276
3277 if (shader->vs_as_gs_a)
3278 vs_add_primid_output(&ctx, key.vs.prim_id_out);
3279
3280 if (ctx.type == PIPE_SHADER_TESS_EVAL)
3281 r600_fetch_tess_io_info(&ctx);
3282
3283 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3284 tgsi_parse_token(&ctx.parse);
3285 switch (ctx.parse.FullToken.Token.Type) {
3286 case TGSI_TOKEN_TYPE_IMMEDIATE:
3287 immediate = &ctx.parse.FullToken.FullImmediate;
3288 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3289 if(ctx.literals == NULL) {
3290 r = -ENOMEM;
3291 goto out_err;
3292 }
3293 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3294 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3295 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3296 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3297 ctx.nliterals++;
3298 break;
3299 case TGSI_TOKEN_TYPE_DECLARATION:
3300 r = tgsi_declaration(&ctx);
3301 if (r)
3302 goto out_err;
3303 break;
3304 case TGSI_TOKEN_TYPE_INSTRUCTION:
3305 case TGSI_TOKEN_TYPE_PROPERTY:
3306 break;
3307 default:
3308 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3309 r = -EINVAL;
3310 goto out_err;
3311 }
3312 }
3313
3314 shader->ring_item_sizes[0] = ctx.next_ring_offset;
3315 shader->ring_item_sizes[1] = 0;
3316 shader->ring_item_sizes[2] = 0;
3317 shader->ring_item_sizes[3] = 0;
3318
3319 /* Process two side if needed */
3320 if (shader->two_side && ctx.colors_used) {
3321 int i, count = ctx.shader->ninput;
3322 unsigned next_lds_loc = ctx.shader->nlds;
3323
3324 /* additional inputs will be allocated right after the existing inputs,
3325 * we won't need them after the color selection, so we don't need to
3326 * reserve these gprs for the rest of the shader code and to adjust
3327 * output offsets etc. */
3328 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3329 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3330
3331 /* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
3332 if (ctx.face_gpr == -1) {
3333 i = ctx.shader->ninput++;
3334 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3335 ctx.shader->input[i].spi_sid = 0;
3336 ctx.shader->input[i].gpr = gpr++;
3337 ctx.face_gpr = ctx.shader->input[i].gpr;
3338 }
3339
3340 for (i = 0; i < count; i++) {
3341 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3342 int ni = ctx.shader->ninput++;
3343 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3344 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3345 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3346 ctx.shader->input[ni].gpr = gpr++;
3347 // TGSI to LLVM needs to know the lds position of inputs.
3348 // Non LLVM path computes it later (in process_twoside_color)
3349 ctx.shader->input[ni].lds_pos = next_lds_loc++;
3350 ctx.shader->input[i].back_color_input = ni;
3351 if (ctx.bc->chip_class >= EVERGREEN) {
3352 if ((r = evergreen_interp_input(&ctx, ni)))
3353 return r;
3354 }
3355 }
3356 }
3357 }
3358
3359 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
3360 shader->nr_ps_max_color_exports = 8;
3361
3362 if (ctx.fragcoord_input >= 0) {
3363 if (ctx.bc->chip_class == CAYMAN) {
3364 for (j = 0 ; j < 4; j++) {
3365 struct r600_bytecode_alu alu;
3366 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3367 alu.op = ALU_OP1_RECIP_IEEE;
3368 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3369 alu.src[0].chan = 3;
3370
3371 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3372 alu.dst.chan = j;
3373 alu.dst.write = (j == 3);
3374 alu.last = 1;
3375 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3376 return r;
3377 }
3378 } else {
3379 struct r600_bytecode_alu alu;
3380 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3381 alu.op = ALU_OP1_RECIP_IEEE;
3382 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3383 alu.src[0].chan = 3;
3384
3385 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3386 alu.dst.chan = 3;
3387 alu.dst.write = 1;
3388 alu.last = 1;
3389 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3390 return r;
3391 }
3392 }
3393
3394 if (ctx.thread_id_gpr) {
3395 load_thread_id_gpr(&ctx);
3396 }
3397
3398 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3399 struct r600_bytecode_alu alu;
3400 int r;
3401
3402 /* GS thread with no output workaround - emit a cut at start of GS */
3403 if (ctx.bc->chip_class == R600)
3404 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3405
3406 for (j = 0; j < 4; j++) {
3407 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3408 alu.op = ALU_OP1_MOV;
3409 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3410 alu.src[0].value = 0;
3411 alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3412 alu.dst.write = 1;
3413 alu.last = 1;
3414 r = r600_bytecode_add_alu(ctx.bc, &alu);
3415 if (r)
3416 return r;
3417 }
3418
3419 if (ctx.shader->gs_tri_strip_adj_fix) {
3420 r = single_alu_op2(&ctx, ALU_OP2_AND_INT,
3421 ctx.gs_rotated_input[0], 2,
3422 0, 2,
3423 V_SQ_ALU_SRC_LITERAL, 1);
3424 if (r)
3425 return r;
3426
3427 for (i = 0; i < 6; i++) {
3428 int rotated = (i + 4) % 6;
3429 int offset_reg = i / 3;
3430 int offset_chan = i % 3;
3431 int rotated_offset_reg = rotated / 3;
3432 int rotated_offset_chan = rotated % 3;
3433
3434 if (offset_reg == 0 && offset_chan == 2)
3435 offset_chan = 3;
3436 if (rotated_offset_reg == 0 && rotated_offset_chan == 2)
3437 rotated_offset_chan = 3;
3438
3439 r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT,
3440 ctx.gs_rotated_input[offset_reg], offset_chan,
3441 ctx.gs_rotated_input[0], 2,
3442 offset_reg, offset_chan,
3443 rotated_offset_reg, rotated_offset_chan);
3444 if (r)
3445 return r;
3446 }
3447 }
3448 }
3449
3450 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3451 r600_fetch_tess_io_info(&ctx);
3452
3453 if (shader->two_side && ctx.colors_used) {
3454 if ((r = process_twoside_color_inputs(&ctx)))
3455 return r;
3456 }
3457
3458 tgsi_parse_init(&ctx.parse, tokens);
3459 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3460 tgsi_parse_token(&ctx.parse);
3461 switch (ctx.parse.FullToken.Token.Type) {
3462 case TGSI_TOKEN_TYPE_INSTRUCTION:
3463 r = tgsi_is_supported(&ctx);
3464 if (r)
3465 goto out_err;
3466 ctx.max_driver_temp_used = 0;
3467 /* reserve first tmp for everyone */
3468 r600_get_temp(&ctx);
3469
3470 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3471 if ((r = tgsi_split_constant(&ctx)))
3472 goto out_err;
3473 if ((r = tgsi_split_literal_constant(&ctx)))
3474 goto out_err;
3475 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3476 if ((r = tgsi_split_gs_inputs(&ctx)))
3477 goto out_err;
3478 } else if (lds_inputs) {
3479 if ((r = tgsi_split_lds_inputs(&ctx)))
3480 goto out_err;
3481 }
3482 if (ctx.bc->chip_class == CAYMAN)
3483 ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3484 else if (ctx.bc->chip_class >= EVERGREEN)
3485 ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3486 else
3487 ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3488 r = ctx.inst_info->process(&ctx);
3489 if (r)
3490 goto out_err;
3491
3492 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3493 r = r600_store_tcs_output(&ctx);
3494 if (r)
3495 goto out_err;
3496 }
3497 break;
3498 default:
3499 break;
3500 }
3501 }
3502
3503 /* Reset the temporary register counter. */
3504 ctx.max_driver_temp_used = 0;
3505
3506 noutput = shader->noutput;
3507
3508 if (!ring_outputs && ctx.clip_vertex_write) {
3509 unsigned clipdist_temp[2];
3510
3511 clipdist_temp[0] = r600_get_temp(&ctx);
3512 clipdist_temp[1] = r600_get_temp(&ctx);
3513
3514 /* need to convert a clipvertex write into clipdistance writes and not export
3515 the clip vertex anymore */
3516
3517 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
3518 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3519 shader->output[noutput].gpr = clipdist_temp[0];
3520 noutput++;
3521 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3522 shader->output[noutput].gpr = clipdist_temp[1];
3523 noutput++;
3524
3525 /* reset spi_sid for clipvertex output to avoid confusing spi */
3526 shader->output[ctx.cv_output].spi_sid = 0;
3527
3528 shader->clip_dist_write = 0xFF;
3529 shader->cc_dist_mask = 0xFF;
3530
3531 for (i = 0; i < 8; i++) {
3532 int oreg = i >> 2;
3533 int ochan = i & 3;
3534
3535 for (j = 0; j < 4; j++) {
3536 struct r600_bytecode_alu alu;
3537 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3538 alu.op = ALU_OP2_DOT4;
3539 alu.src[0].sel = shader->output[ctx.cv_output].gpr;
3540 alu.src[0].chan = j;
3541
3542 alu.src[1].sel = 512 + i;
3543 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
3544 alu.src[1].chan = j;
3545
3546 alu.dst.sel = clipdist_temp[oreg];
3547 alu.dst.chan = j;
3548 alu.dst.write = (j == ochan);
3549 if (j == 3)
3550 alu.last = 1;
3551 r = r600_bytecode_add_alu(ctx.bc, &alu);
3552 if (r)
3553 return r;
3554 }
3555 }
3556 }
3557
3558 /* Add stream outputs. */
3559 if (so.num_outputs) {
3560 bool emit = false;
3561 if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
3562 emit = true;
3563 if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
3564 emit = true;
3565 if (emit)
3566 emit_streamout(&ctx, &so, -1, NULL);
3567 }
3568 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
3569 convert_edgeflag_to_int(&ctx);
3570
3571 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3572 r600_emit_tess_factor(&ctx);
3573
3574 if (lds_outputs) {
3575 if (ctx.type == PIPE_SHADER_VERTEX) {
3576 if (ctx.shader->noutput)
3577 emit_lds_vs_writes(&ctx);
3578 }
3579 } else if (ring_outputs) {
3580 if (shader->vs_as_es || shader->tes_as_es) {
3581 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
3582 ctx.gs_export_gpr_tregs[1] = -1;
3583 ctx.gs_export_gpr_tregs[2] = -1;
3584 ctx.gs_export_gpr_tregs[3] = -1;
3585
3586 emit_gs_ring_writes(&ctx, &so, -1, FALSE);
3587 }
3588 } else {
3589 /* Export output */
3590 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
3591
3592 for (i = 0, j = 0; i < noutput; i++, j++) {
3593 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3594 output[j].gpr = shader->output[i].gpr;
3595 output[j].elem_size = 3;
3596 output[j].swizzle_x = 0;
3597 output[j].swizzle_y = 1;
3598 output[j].swizzle_z = 2;
3599 output[j].swizzle_w = 3;
3600 output[j].burst_count = 1;
3601 output[j].type = 0xffffffff;
3602 output[j].op = CF_OP_EXPORT;
3603 switch (ctx.type) {
3604 case PIPE_SHADER_VERTEX:
3605 case PIPE_SHADER_TESS_EVAL:
3606 switch (shader->output[i].name) {
3607 case TGSI_SEMANTIC_POSITION:
3608 output[j].array_base = 60;
3609 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3610 pos_emitted = true;
3611 break;
3612
3613 case TGSI_SEMANTIC_PSIZE:
3614 output[j].array_base = 61;
3615 output[j].swizzle_y = 7;
3616 output[j].swizzle_z = 7;
3617 output[j].swizzle_w = 7;
3618 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3619 pos_emitted = true;
3620 break;
3621 case TGSI_SEMANTIC_EDGEFLAG:
3622 output[j].array_base = 61;
3623 output[j].swizzle_x = 7;
3624 output[j].swizzle_y = 0;
3625 output[j].swizzle_z = 7;
3626 output[j].swizzle_w = 7;
3627 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3628 pos_emitted = true;
3629 break;
3630 case TGSI_SEMANTIC_LAYER:
3631 /* spi_sid is 0 for outputs that are
3632 * not consumed by PS */
3633 if (shader->output[i].spi_sid) {
3634 output[j].array_base = next_param_base++;
3635 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3636 j++;
3637 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3638 }
3639 output[j].array_base = 61;
3640 output[j].swizzle_x = 7;
3641 output[j].swizzle_y = 7;
3642 output[j].swizzle_z = 0;
3643 output[j].swizzle_w = 7;
3644 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3645 pos_emitted = true;
3646 break;
3647 case TGSI_SEMANTIC_VIEWPORT_INDEX:
3648 /* spi_sid is 0 for outputs that are
3649 * not consumed by PS */
3650 if (shader->output[i].spi_sid) {
3651 output[j].array_base = next_param_base++;
3652 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3653 j++;
3654 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3655 }
3656 output[j].array_base = 61;
3657 output[j].swizzle_x = 7;
3658 output[j].swizzle_y = 7;
3659 output[j].swizzle_z = 7;
3660 output[j].swizzle_w = 0;
3661 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3662 pos_emitted = true;
3663 break;
3664 case TGSI_SEMANTIC_CLIPVERTEX:
3665 j--;
3666 break;
3667 case TGSI_SEMANTIC_CLIPDIST:
3668 output[j].array_base = next_clip_base++;
3669 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3670 pos_emitted = true;
3671 /* spi_sid is 0 for clipdistance outputs that were generated
3672 * for clipvertex - we don't need to pass them to PS */
3673 if (shader->output[i].spi_sid) {
3674 j++;
3675 /* duplicate it as PARAM to pass to the pixel shader */
3676 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3677 output[j].array_base = next_param_base++;
3678 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3679 }
3680 break;
3681 case TGSI_SEMANTIC_FOG:
3682 output[j].swizzle_y = 4; /* 0 */
3683 output[j].swizzle_z = 4; /* 0 */
3684 output[j].swizzle_w = 5; /* 1 */
3685 break;
3686 case TGSI_SEMANTIC_PRIMID:
3687 output[j].swizzle_x = 2;
3688 output[j].swizzle_y = 4; /* 0 */
3689 output[j].swizzle_z = 4; /* 0 */
3690 output[j].swizzle_w = 4; /* 0 */
3691 break;
3692 }
3693
3694 break;
3695 case PIPE_SHADER_FRAGMENT:
3696 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
3697 /* never export more colors than the number of CBs */
3698 if (shader->output[i].sid >= max_color_exports) {
3699 /* skip export */
3700 j--;
3701 continue;
3702 }
3703 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3704 output[j].array_base = shader->output[i].sid;
3705 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3706 shader->nr_ps_color_exports++;
3707 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
3708 for (k = 1; k < max_color_exports; k++) {
3709 j++;
3710 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3711 output[j].gpr = shader->output[i].gpr;
3712 output[j].elem_size = 3;
3713 output[j].swizzle_x = 0;
3714 output[j].swizzle_y = 1;
3715 output[j].swizzle_z = 2;
3716 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3717 output[j].burst_count = 1;
3718 output[j].array_base = k;
3719 output[j].op = CF_OP_EXPORT;
3720 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3721 shader->nr_ps_color_exports++;
3722 }
3723 }
3724 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
3725 output[j].array_base = 61;
3726 output[j].swizzle_x = 2;
3727 output[j].swizzle_y = 7;
3728 output[j].swizzle_z = output[j].swizzle_w = 7;
3729 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3730 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
3731 output[j].array_base = 61;
3732 output[j].swizzle_x = 7;
3733 output[j].swizzle_y = 1;
3734 output[j].swizzle_z = output[j].swizzle_w = 7;
3735 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3736 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
3737 output[j].array_base = 61;
3738 output[j].swizzle_x = 7;
3739 output[j].swizzle_y = 7;
3740 output[j].swizzle_z = 0;
3741 output[j].swizzle_w = 7;
3742 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3743 } else {
3744 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
3745 r = -EINVAL;
3746 goto out_err;
3747 }
3748 break;
3749 case PIPE_SHADER_TESS_CTRL:
3750 break;
3751 default:
3752 R600_ERR("unsupported processor type %d\n", ctx.type);
3753 r = -EINVAL;
3754 goto out_err;
3755 }
3756
3757 if (output[j].type == 0xffffffff) {
3758 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3759 output[j].array_base = next_param_base++;
3760 }
3761 }
3762
3763 /* add fake position export */
3764 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
3765 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3766 output[j].gpr = 0;
3767 output[j].elem_size = 3;
3768 output[j].swizzle_x = 7;
3769 output[j].swizzle_y = 7;
3770 output[j].swizzle_z = 7;
3771 output[j].swizzle_w = 7;
3772 output[j].burst_count = 1;
3773 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3774 output[j].array_base = 60;
3775 output[j].op = CF_OP_EXPORT;
3776 j++;
3777 }
3778
3779 /* add fake param output for vertex shader if no param is exported */
3780 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
3781 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3782 output[j].gpr = 0;
3783 output[j].elem_size = 3;
3784 output[j].swizzle_x = 7;
3785 output[j].swizzle_y = 7;
3786 output[j].swizzle_z = 7;
3787 output[j].swizzle_w = 7;
3788 output[j].burst_count = 1;
3789 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3790 output[j].array_base = 0;
3791 output[j].op = CF_OP_EXPORT;
3792 j++;
3793 }
3794
3795 /* add fake pixel export */
3796 if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
3797 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3798 output[j].gpr = 0;
3799 output[j].elem_size = 3;
3800 output[j].swizzle_x = 7;
3801 output[j].swizzle_y = 7;
3802 output[j].swizzle_z = 7;
3803 output[j].swizzle_w = 7;
3804 output[j].burst_count = 1;
3805 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3806 output[j].array_base = 0;
3807 output[j].op = CF_OP_EXPORT;
3808 j++;
3809 shader->nr_ps_color_exports++;
3810 }
3811
3812 noutput = j;
3813
3814 /* set export done on last export of each type */
3815 for (k = noutput - 1, output_done = 0; k >= 0; k--) {
3816 if (!(output_done & (1 << output[k].type))) {
3817 output_done |= (1 << output[k].type);
3818 output[k].op = CF_OP_EXPORT_DONE;
3819 }
3820 }
3821 /* add output to bytecode */
3822 for (i = 0; i < noutput; i++) {
3823 r = r600_bytecode_add_output(ctx.bc, &output[i]);
3824 if (r)
3825 goto out_err;
3826 }
3827 }
3828
3829 /* add program end */
3830 if (ctx.bc->chip_class == CAYMAN)
3831 cm_bytecode_add_cf_end(ctx.bc);
3832 else {
3833 const struct cf_op_info *last = NULL;
3834
3835 if (ctx.bc->cf_last)
3836 last = r600_isa_cf(ctx.bc->cf_last->op);
3837
3838 /* alu clause instructions don't have EOP bit, so add NOP */
3839 if (!last || last->flags & CF_ALU)
3840 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
3841
3842 ctx.bc->cf_last->end_of_program = 1;
3843 }
3844
3845 /* check GPR limit - we have 124 = 128 - 4
3846 * (4 are reserved as alu clause temporary registers) */
3847 if (ctx.bc->ngpr > 124) {
3848 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
3849 r = -ENOMEM;
3850 goto out_err;
3851 }
3852
3853 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3854 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
3855 return r;
3856 }
3857
3858 free(ctx.literals);
3859 tgsi_parse_free(&ctx.parse);
3860 return 0;
3861 out_err:
3862 free(ctx.literals);
3863 tgsi_parse_free(&ctx.parse);
3864 return r;
3865 }
3866
3867 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
3868 {
3869 const unsigned tgsi_opcode =
3870 ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
3871 R600_ERR("%s tgsi opcode unsupported\n",
3872 tgsi_get_opcode_name(tgsi_opcode));
3873 return -EINVAL;
3874 }
3875
/* TGSI_OPCODE_END needs no per-instruction code; the end-of-program
 * marker is emitted separately when the bytecode is finalized. */
static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)
{
	return 0;
}
3880
3881 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
3882 const struct r600_shader_src *shader_src,
3883 unsigned chan)
3884 {
3885 bc_src->sel = shader_src->sel;
3886 bc_src->chan = shader_src->swizzle[chan];
3887 bc_src->neg = shader_src->neg;
3888 bc_src->abs = shader_src->abs;
3889 bc_src->rel = shader_src->rel;
3890 bc_src->value = shader_src->value[bc_src->chan];
3891 bc_src->kc_bank = shader_src->kc_bank;
3892 bc_src->kc_rel = shader_src->kc_rel;
3893 }
3894
3895 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
3896 {
3897 bc_src->abs = 1;
3898 bc_src->neg = 0;
3899 }
3900
3901 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
3902 {
3903 bc_src->neg = !bc_src->neg;
3904 }
3905
3906 static void tgsi_dst(struct r600_shader_ctx *ctx,
3907 const struct tgsi_full_dst_register *tgsi_dst,
3908 unsigned swizzle,
3909 struct r600_bytecode_alu_dst *r600_dst)
3910 {
3911 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3912
3913 r600_dst->sel = tgsi_dst->Register.Index;
3914 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
3915 r600_dst->chan = swizzle;
3916 r600_dst->write = 1;
3917 if (inst->Instruction.Saturate) {
3918 r600_dst->clamp = 1;
3919 }
3920 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
3921 if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
3922 return;
3923 }
3924 }
3925 if (tgsi_dst->Register.Indirect)
3926 r600_dst->rel = V_SQ_REL_RELATIVE;
3927
3928 }
3929
/* Emit a two-source 64-bit (double) ALU operation.
 *
 * Doubles occupy channel pairs (.xy holds one value, .zw another), so the
 * TGSI writemask is first widened to full pairs.
 *
 * singledest - the op yields one meaningful lane per pair; the odd channel
 *              of each pair is emitted with its write bit cleared.
 * swap       - exchange the two TGSI sources when building the ALU sources.
 *
 * use_tmp is non-zero when the requested destination was an odd channel
 * (.y -> 1, .w -> 3): the pair is computed into temp_reg first and the
 * valid lane (channel use_tmp - 1) is moved to the real destination after.
 */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;

	if (singledest) {
		/* widen the mask to a full channel pair, remembering (via
		 * use_tmp) when the caller asked for the odd channel */
		switch (write_mask) {
		case 0x1:
			write_mask = 0x3;
			break;
		case 0x2:
			use_tmp = 1;
			write_mask = 0x3;
			break;
		case 0x4:
			write_mask = 0xc;
			break;
		case 0x8:
			write_mask = 0xc;
			use_tmp = 3;
			break;
		}
	}

	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			if (use_tmp) {
				/* stage the pair in the temp; moved to dst below */
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
			}
			/* only the even channel of each pair carries a result */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				/* fp64 sources use the word-swapped channel order */
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases */
		if (i == 1 || i == 3) {
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_DABS:
				/* NOTE(review): abs is applied only on the odd
				 * channels — presumably the sign-carrying half
				 * of the double; confirm against the ISA docs. */
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		write_mask = inst->Dst[0].Register.WriteMask;

		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			/* the computed lane lives at channel use_tmp - 1 */
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
4029
4030 static int tgsi_op2_64(struct r600_shader_ctx *ctx)
4031 {
4032 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4033 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4034 /* confirm writemasking */
4035 if ((write_mask & 0x3) != 0x3 &&
4036 (write_mask & 0xc) != 0xc) {
4037 fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
4038 return -1;
4039 }
4040 return tgsi_op2_64_params(ctx, false, false);
4041 }
4042
/* 64-bit op2 producing a single result lane per channel pair,
 * sources in TGSI order. */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false);
}
4047
/* 64-bit op2 producing a single result lane per channel pair,
 * with src0/src1 exchanged. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true);
}
4052
/* Emit a three-source 64-bit ALU operation across all four slots.
 *
 * All four channels are always emitted; channels the instruction does not
 * write are redirected into a scratch temp register instead of the real
 * destination.
 * NOTE(review): sources use swizzle channel 1 for slots 0-2 and channel 0
 * for slot 3 — presumably matching the hardware's double operand layout;
 * confirm against the ISA docs before changing.
 */
static int tgsi_op3_64(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = 3;
	int tmp = r600_get_temp(ctx);

	for (i = 0; i < lasti + 1; i++) {

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
		}

		/* masked-out channels still execute but land in the scratch temp */
		if (inst->Dst[0].Register.WriteMask & (1 << i))
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		else
			alu.dst.sel = tmp;

		alu.dst.chan = i;
		alu.is_op3 = 1;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4085
/* Emit a plain two-source ALU operation, one instruction per channel
 * enabled in the destination writemask.
 *
 * swap       - exchange the two TGSI source operands when emitting.
 * trans_only - mark every emitted instruction as last in its ALU group;
 *              when more than one channel is written, results are staged
 *              in temp_reg and copied to the real destination afterwards.
 */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));
	unsigned op = ctx->inst_info->op;

	/* honor TGSI_PROPERTY_MUL_ZERO_WINS by using the non-IEEE multiply */
	if (op == ALU_OP2_MUL_IEEE &&
	    ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
		op = ALU_OP2_MUL;

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			/* stage the result in the temp; copied out below */
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
4149
/* Plain two-operand op: no operand swap, not transcendental-only. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
4154
/* Two-operand op with src0/src1 exchanged (reversed hw operand order). */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
4159
/* Two-operand op restricted to the transcendental unit (each instance
 * must end its instruction group — see tgsi_op2_s). */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
4164
4165 static int tgsi_ineg(struct r600_shader_ctx *ctx)
4166 {
4167 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4168 struct r600_bytecode_alu alu;
4169 int i, r;
4170 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4171
4172 for (i = 0; i < lasti + 1; i++) {
4173
4174 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4175 continue;
4176 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4177 alu.op = ctx->inst_info->op;
4178
4179 alu.src[0].sel = V_SQ_ALU_SRC_0;
4180
4181 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4182
4183 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4184
4185 if (i == lasti) {
4186 alu.last = 1;
4187 }
4188 r = r600_bytecode_add_alu(ctx->bc, &alu);
4189 if (r)
4190 return r;
4191 }
4192 return 0;
4193
4194 }
4195
4196 static int tgsi_dneg(struct r600_shader_ctx *ctx)
4197 {
4198 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4199 struct r600_bytecode_alu alu;
4200 int i, r;
4201 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4202
4203 for (i = 0; i < lasti + 1; i++) {
4204
4205 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4206 continue;
4207 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4208 alu.op = ALU_OP1_MOV;
4209
4210 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4211
4212 if (i == 1 || i == 3)
4213 r600_bytecode_src_toggle_neg(&alu.src[0]);
4214 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4215
4216 if (i == lasti) {
4217 alu.last = 1;
4218 }
4219 r = r600_bytecode_add_alu(ctx->bc, &alu);
4220 if (r)
4221 return r;
4222 }
4223 return 0;
4224
4225 }
4226
/* DFRACEXP: split a double into significand (Dst[0]) and exponent (Dst[1]).
 * The raw op is issued on all four slots into the temp; the significand is
 * then replicated from temp channels 2/3 and the exponent taken from temp
 * channel 1.  NOTE(review): the exact temp-channel layout of the hw op's
 * result is assumed from these reads — confirm against the ISA docs. */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;

	/* issue the op on all four slots, results land in the temp */
	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Replicate significand result across channels. */
	for (i = 0; i <= 3; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].chan = (i & 1) + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* exponent: copy temp channel 1 to the first written dst1 channel */
	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}
4289
4290
/* I2D/U2D (evergreen/cayman): convert each source int to a 32-bit float
 * in the temp, then widen with FLT32_TO_FLT64.  Each double result fills
 * a channel pair: the even channel reads the converted float, the odd
 * channel is fed a literal 0. */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
	       inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	/* int -> float32, one conversion per result pair */
	for (i = 0; i <= (lasti+1)/2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* widen float32 -> float64 across the destination pair(s) */
	for (i = 0; i <= lasti; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_FLT32_TO_FLT64;

		alu.src[0].chan = i/2;
		if (i%2 == 0)
			alu.src[0].sel = ctx->temp_reg;
		else {
			/* odd half of the pair takes a zero literal */
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0x0;
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4337
4338 static int egcm_double_to_int(struct r600_shader_ctx *ctx)
4339 {
4340 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4341 struct r600_bytecode_alu alu;
4342 int i, r;
4343 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4344
4345 assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
4346 inst->Instruction.Opcode == TGSI_OPCODE_D2U);
4347
4348 for (i = 0; i <= lasti; i++) {
4349 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4350 alu.op = ALU_OP1_FLT64_TO_FLT32;
4351
4352 r600_bytecode_src(&alu.src[0], &ctx->src[0], fp64_switch(i));
4353 alu.dst.chan = i;
4354 alu.dst.sel = ctx->temp_reg;
4355 alu.dst.write = i%2 == 0;
4356 alu.last = i == lasti;
4357
4358 r = r600_bytecode_add_alu(ctx->bc, &alu);
4359 if (r)
4360 return r;
4361 }
4362
4363 for (i = 0; i <= (lasti+1)/2; i++) {
4364 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4365 alu.op = ctx->inst_info->op;
4366
4367 alu.src[0].chan = i*2;
4368 alu.src[0].sel = ctx->temp_reg;
4369 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
4370 alu.last = 1;
4371
4372 r = r600_bytecode_add_alu(ctx->bc, &alu);
4373 if (r)
4374 return r;
4375 }
4376
4377 return 0;
4378 }
4379
/* Emit a unary 64-bit op (e.g. RECIP_64) into dst_reg.
 * The op is issued on slots 0..2 but only slots 0/1 write (the double
 * result lands in X/Y).  abs forces |src| on the low operand (used for
 * DRSQ/DSQRT).  On non-Cayman every instruction closes its own group;
 * on Cayman only the final slot does.
 *
 * Returns 0 on success, or the r600_bytecode_add_alu() error code.
 */
static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
					unsigned op,
					int dst_reg,
					struct r600_shader_src *src,
					bool abs)
{
	struct r600_bytecode_alu alu;
	const int last_slot = 3;
	int r;

	/* these have to write the result to X/Y by the looks of it */
	for (int i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;

		/* operand pair: high dword in src[0], low dword in src[1] */
		r600_bytecode_src(&alu.src[0], src, 1);
		r600_bytecode_src(&alu.src[1], src, 0);

		if (abs)
			r600_bytecode_src_set_abs(&alu.src[1]);

		alu.dst.sel = dst_reg;
		alu.dst.chan = i;
		alu.dst.write = (i == 0 || i == 1);

		if (bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4414
/* Cayman unary double op (DRSQ/DSQRT/...): run the raw op into temp X/Y,
 * then copy the pair to the destination channels selected by the
 * writemask (xy or zw — exactly one double at a time). */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* should only be one src regs */
	assert(inst->Instruction.NumSrcRegs == 1);

	/* only support one double at a time */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	/* DRSQ/DSQRT need |src|; the other callers don't */
	r = cayman_emit_unary_double_raw(
		ctx->bc, ctx->inst_info->op, t1,
		&ctx->src[0],
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
	if (r)
		return r;

	/* move temp X/Y pair into the written dst channels */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4455
4456 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
4457 {
4458 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4459 int i, j, r;
4460 struct r600_bytecode_alu alu;
4461 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
4462
4463 for (i = 0 ; i < last_slot; i++) {
4464 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4465 alu.op = ctx->inst_info->op;
4466 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4467 r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
4468
4469 /* RSQ should take the absolute value of src */
4470 if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
4471 r600_bytecode_src_set_abs(&alu.src[j]);
4472 }
4473 }
4474 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4475 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4476
4477 if (i == last_slot - 1)
4478 alu.last = 1;
4479 r = r600_bytecode_add_alu(ctx->bc, &alu);
4480 if (r)
4481 return r;
4482 }
4483 return 0;
4484 }
4485
/* Cayman integer multiply (MULHI/MULLO-style): for each written channel k
 * the op is issued on all four slots reading source channel k, with only
 * slot k writing the temp; the temp is then copied to the destination.
 * NOTE(review): the full-vector issue per result channel presumably
 * reflects Cayman's t-slot replication requirement — confirm vs ISA. */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			alu.dst.write = (i == k); /* only slot k keeps its result */
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* copy accumulated temp channels to the destination */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4533
4534
/* Cayman DMUL: issue MUL_64 across all four slots into the temp, then
 * copy the temp to the written destination channels.  Only one double
 * (xy or zw) per instruction — see the assert below. */
static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* t1 would get overwritten below if we actually tried to
	 * multiply two pairs of doubles at a time. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	/* k selects which source pair (low or high) is being multiplied */
	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			/* slots 0-2 read the pair's odd channel, slot 3 the even one */
			r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* move the result pair to the destination */
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4584
/*
 * Emit RECIP_64 + MUL_64 to implement division:
 * dst = src0 * (1 / src1).  The reciprocal lands in temp X/Y, the
 * multiply result likewise, and the pair is finally moved to the
 * destination channels.
 */
static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_alu alu;
	int t1 = ctx->temp_reg;
	int k;

	/* Only support one double at a time. This is the same constraint as
	 * in DMUL lowering. */
	assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
	       inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);

	/* k selects the low (xy) or high (zw) source pair */
	k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;

	/* temp.xy = 1.0 / src1 */
	r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
	if (r)
		return r;

	/* temp.xy = src0 * temp.xy, issued across all four slots */
	for (int i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL_64;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));

		alu.src[1].sel = t1;
		alu.src[1].chan = (i == 3) ? 0 : 1;

		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* move temp.xy into the destination pair */
	for (int i = 0; i < 2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
		alu.dst.write = 1;
		if (i == 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4641
/*
 * Prepare the trig argument in temp.x:
 *   temp.x = fract(src * 1/(2*PI) + 0.5), then rescaled —
 * r600 - trunc to -PI..PI range (muladd by 2*PI, -PI)
 * r700 - normalize by dividing by 2PI (rescale to -0.5..0.5)
 * see fdo bug 27901
 */
static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
{
	int r;
	struct r600_bytecode_alu alu;

	/* temp.x = src * (1 / 2PI) + 0.5 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
	alu.src[2].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.x = fract(temp.x) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FRACT;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.x = temp.x * scale + bias (chip-class dependent, see above) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[2].chan = 0;

	if (ctx->bc->chip_class == R600) {
		alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
		alu.src[2].value = u_bitcast_f2u(-M_PI);
	} else {
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
		alu.src[2].neg = 1;
	}

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
4717
/* Cayman SIN/COS: normalize the argument via tgsi_setup_trig (temp.x),
 * then issue the trig op on each needed slot; every slot reads temp.x
 * and writes its own destination channel. */
static int cayman_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
	int i, r;

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;


	for (i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4748
/* SIN/COS (non-Cayman): normalize the argument via tgsi_setup_trig,
 * compute the trig op once into temp.x, then replicate the scalar
 * result to every written destination channel. */
static int tgsi_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	/* temp.x = op(temp.x) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* replicate result */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		/* src chan stays 0 (temp.x) from the memset */
		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4791
4792 static int tgsi_kill(struct r600_shader_ctx *ctx)
4793 {
4794 const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4795 struct r600_bytecode_alu alu;
4796 int i, r;
4797
4798 for (i = 0; i < 4; i++) {
4799 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4800 alu.op = ctx->inst_info->op;
4801
4802 alu.dst.chan = i;
4803
4804 alu.src[0].sel = V_SQ_ALU_SRC_0;
4805
4806 if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
4807 alu.src[1].sel = V_SQ_ALU_SRC_1;
4808 alu.src[1].neg = 1;
4809 } else {
4810 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4811 }
4812 if (i == 3) {
4813 alu.last = 1;
4814 }
4815 r = r600_bytecode_add_alu(ctx->bc, &alu);
4816 if (r)
4817 return r;
4818 }
4819
4820 /* kill must be last in ALU */
4821 ctx->bc->force_add_cf = 1;
4822 ctx->shader->uses_kill = TRUE;
4823 return 0;
4824 }
4825
/* LIT: classic fixed-function lighting coefficients.
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0)
 *   dst.z = exp2(MUL_LIT(log(max(src.y, 0)), src.w, src.x))
 *   dst.w = 1.0
 * On Cayman the scalar LOG/EXP ops are replicated over three slots with
 * only the needed channel written. */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* the expensive z term is only computed when z is written */
	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		unsigned i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the log result landed (alu still holds it) */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel  = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel  = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel  = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
4975
4976 static int tgsi_rsq(struct r600_shader_ctx *ctx)
4977 {
4978 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4979 struct r600_bytecode_alu alu;
4980 int i, r;
4981
4982 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4983
4984 alu.op = ALU_OP1_RECIPSQRT_IEEE;
4985
4986 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
4987 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
4988 r600_bytecode_src_set_abs(&alu.src[i]);
4989 }
4990 alu.dst.sel = ctx->temp_reg;
4991 alu.dst.write = 1;
4992 alu.last = 1;
4993 r = r600_bytecode_add_alu(ctx->bc, &alu);
4994 if (r)
4995 return r;
4996 /* replicate result */
4997 return tgsi_helper_tempx_replicate(ctx);
4998 }
4999
5000 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
5001 {
5002 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5003 struct r600_bytecode_alu alu;
5004 int i, r;
5005
5006 for (i = 0; i < 4; i++) {
5007 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5008 alu.src[0].sel = ctx->temp_reg;
5009 alu.op = ALU_OP1_MOV;
5010 alu.dst.chan = i;
5011 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5012 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5013 if (i == 3)
5014 alu.last = 1;
5015 r = r600_bytecode_add_alu(ctx->bc, &alu);
5016 if (r)
5017 return r;
5018 }
5019 return 0;
5020 }
5021
5022 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
5023 {
5024 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5025 struct r600_bytecode_alu alu;
5026 int i, r;
5027
5028 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5029 alu.op = ctx->inst_info->op;
5030 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5031 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5032 }
5033 alu.dst.sel = ctx->temp_reg;
5034 alu.dst.write = 1;
5035 alu.last = 1;
5036 r = r600_bytecode_add_alu(ctx->bc, &alu);
5037 if (r)
5038 return r;
5039 /* replicate result */
5040 return tgsi_helper_tempx_replicate(ctx);
5041 }
5042
/* Cayman POW(a,b) = EXP2(b * LOG2(a)).  The scalar LOG is replicated
 * over three slots (t-slot op on Cayman), the multiply runs once, and
 * the EXP is issued per written destination slot. */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	/* temp.xyz = LOG2(a) */
	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a))*/
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
5092
5093 static int tgsi_pow(struct r600_shader_ctx *ctx)
5094 {
5095 struct r600_bytecode_alu alu;
5096 int r;
5097
5098 /* LOG2(a) */
5099 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5100 alu.op = ALU_OP1_LOG_IEEE;
5101 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5102 alu.dst.sel = ctx->temp_reg;
5103 alu.dst.write = 1;
5104 alu.last = 1;
5105 r = r600_bytecode_add_alu(ctx->bc, &alu);
5106 if (r)
5107 return r;
5108 /* b * LOG2(a) */
5109 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5110 alu.op = ALU_OP2_MUL;
5111 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5112 alu.src[1].sel = ctx->temp_reg;
5113 alu.dst.sel = ctx->temp_reg;
5114 alu.dst.write = 1;
5115 alu.last = 1;
5116 r = r600_bytecode_add_alu(ctx->bc, &alu);
5117 if (r)
5118 return r;
5119 /* POW(a,b) = EXP2(b * LOG2(a))*/
5120 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5121 alu.op = ALU_OP1_EXP_IEEE;
5122 alu.src[0].sel = ctx->temp_reg;
5123 alu.dst.sel = ctx->temp_reg;
5124 alu.dst.write = 1;
5125 alu.last = 1;
5126 r = r600_bytecode_add_alu(ctx->bc, &alu);
5127 if (r)
5128 return r;
5129 return tgsi_helper_tempx_replicate(ctx);
5130 }
5131
5132 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
5133 {
5134 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5135 struct r600_bytecode_alu alu;
5136 int i, r, j;
5137 unsigned write_mask = inst->Dst[0].Register.WriteMask;
5138 int tmp0 = ctx->temp_reg;
5139 int tmp1 = r600_get_temp(ctx);
5140 int tmp2 = r600_get_temp(ctx);
5141 int tmp3 = r600_get_temp(ctx);
5142 /* Unsigned path:
5143 *
5144 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
5145 *
5146 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
5147 * 2. tmp0.z = lo (tmp0.x * src2)
5148 * 3. tmp0.w = -tmp0.z
5149 * 4. tmp0.y = hi (tmp0.x * src2)
5150 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
5151 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
5152 * 7. tmp1.x = tmp0.x - tmp0.w
5153 * 8. tmp1.y = tmp0.x + tmp0.w
5154 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
5155 * 10. tmp0.z = hi(tmp0.x * src1) = q
5156 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
5157 *
5158 * 12. tmp0.w = src1 - tmp0.y = r
5159 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
5160 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
5161 *
5162 * if DIV
5163 *
5164 * 15. tmp1.z = tmp0.z + 1 = q + 1
5165 * 16. tmp1.w = tmp0.z - 1 = q - 1
5166 *
5167 * else MOD
5168 *
5169 * 15. tmp1.z = tmp0.w - src2 = r - src2
5170 * 16. tmp1.w = tmp0.w + src2 = r + src2
5171 *
5172 * endif
5173 *
5174 * 17. tmp1.x = tmp1.x & tmp1.y
5175 *
5176 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
5177 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
5178 *
5179 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
5180 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
5181 *
5182 * Signed path:
5183 *
5184 * Same as unsigned, using abs values of the operands,
5185 * and fixing the sign of the result in the end.
5186 */
5187
5188 for (i = 0; i < 4; i++) {
5189 if (!(write_mask & (1<<i)))
5190 continue;
5191
5192 if (signed_op) {
5193
5194 /* tmp2.x = -src0 */
5195 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5196 alu.op = ALU_OP2_SUB_INT;
5197
5198 alu.dst.sel = tmp2;
5199 alu.dst.chan = 0;
5200 alu.dst.write = 1;
5201
5202 alu.src[0].sel = V_SQ_ALU_SRC_0;
5203
5204 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5205
5206 alu.last = 1;
5207 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5208 return r;
5209
5210 /* tmp2.y = -src1 */
5211 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5212 alu.op = ALU_OP2_SUB_INT;
5213
5214 alu.dst.sel = tmp2;
5215 alu.dst.chan = 1;
5216 alu.dst.write = 1;
5217
5218 alu.src[0].sel = V_SQ_ALU_SRC_0;
5219
5220 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5221
5222 alu.last = 1;
5223 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5224 return r;
5225
5226 /* tmp2.z sign bit is set if src0 and src2 signs are different */
5227 /* it will be a sign of the quotient */
5228 if (!mod) {
5229
5230 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5231 alu.op = ALU_OP2_XOR_INT;
5232
5233 alu.dst.sel = tmp2;
5234 alu.dst.chan = 2;
5235 alu.dst.write = 1;
5236
5237 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5238 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5239
5240 alu.last = 1;
5241 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5242 return r;
5243 }
5244
5245 /* tmp2.x = |src0| */
5246 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5247 alu.op = ALU_OP3_CNDGE_INT;
5248 alu.is_op3 = 1;
5249
5250 alu.dst.sel = tmp2;
5251 alu.dst.chan = 0;
5252 alu.dst.write = 1;
5253
5254 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5255 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5256 alu.src[2].sel = tmp2;
5257 alu.src[2].chan = 0;
5258
5259 alu.last = 1;
5260 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5261 return r;
5262
5263 /* tmp2.y = |src1| */
5264 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5265 alu.op = ALU_OP3_CNDGE_INT;
5266 alu.is_op3 = 1;
5267
5268 alu.dst.sel = tmp2;
5269 alu.dst.chan = 1;
5270 alu.dst.write = 1;
5271
5272 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5273 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5274 alu.src[2].sel = tmp2;
5275 alu.src[2].chan = 1;
5276
5277 alu.last = 1;
5278 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5279 return r;
5280
5281 }
5282
5283 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
5284 if (ctx->bc->chip_class == CAYMAN) {
5285 /* tmp3.x = u2f(src2) */
5286 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5287 alu.op = ALU_OP1_UINT_TO_FLT;
5288
5289 alu.dst.sel = tmp3;
5290 alu.dst.chan = 0;
5291 alu.dst.write = 1;
5292
5293 if (signed_op) {
5294 alu.src[0].sel = tmp2;
5295 alu.src[0].chan = 1;
5296 } else {
5297 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5298 }
5299
5300 alu.last = 1;
5301 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5302 return r;
5303
5304 /* tmp0.x = recip(tmp3.x) */
5305 for (j = 0 ; j < 3; j++) {
5306 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5307 alu.op = ALU_OP1_RECIP_IEEE;
5308
5309 alu.dst.sel = tmp0;
5310 alu.dst.chan = j;
5311 alu.dst.write = (j == 0);
5312
5313 alu.src[0].sel = tmp3;
5314 alu.src[0].chan = 0;
5315
5316 if (j == 2)
5317 alu.last = 1;
5318 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5319 return r;
5320 }
5321
5322 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5323 alu.op = ALU_OP2_MUL;
5324
5325 alu.src[0].sel = tmp0;
5326 alu.src[0].chan = 0;
5327
5328 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5329 alu.src[1].value = 0x4f800000;
5330
5331 alu.dst.sel = tmp3;
5332 alu.dst.write = 1;
5333 alu.last = 1;
5334 r = r600_bytecode_add_alu(ctx->bc, &alu);
5335 if (r)
5336 return r;
5337
5338 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5339 alu.op = ALU_OP1_FLT_TO_UINT;
5340
5341 alu.dst.sel = tmp0;
5342 alu.dst.chan = 0;
5343 alu.dst.write = 1;
5344
5345 alu.src[0].sel = tmp3;
5346 alu.src[0].chan = 0;
5347
5348 alu.last = 1;
5349 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5350 return r;
5351
5352 } else {
5353 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5354 alu.op = ALU_OP1_RECIP_UINT;
5355
5356 alu.dst.sel = tmp0;
5357 alu.dst.chan = 0;
5358 alu.dst.write = 1;
5359
5360 if (signed_op) {
5361 alu.src[0].sel = tmp2;
5362 alu.src[0].chan = 1;
5363 } else {
5364 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5365 }
5366
5367 alu.last = 1;
5368 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5369 return r;
5370 }
5371
5372 /* 2. tmp0.z = lo (tmp0.x * src2) */
5373 if (ctx->bc->chip_class == CAYMAN) {
5374 for (j = 0 ; j < 4; j++) {
5375 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5376 alu.op = ALU_OP2_MULLO_UINT;
5377
5378 alu.dst.sel = tmp0;
5379 alu.dst.chan = j;
5380 alu.dst.write = (j == 2);
5381
5382 alu.src[0].sel = tmp0;
5383 alu.src[0].chan = 0;
5384 if (signed_op) {
5385 alu.src[1].sel = tmp2;
5386 alu.src[1].chan = 1;
5387 } else {
5388 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5389 }
5390
5391 alu.last = (j == 3);
5392 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5393 return r;
5394 }
5395 } else {
5396 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5397 alu.op = ALU_OP2_MULLO_UINT;
5398
5399 alu.dst.sel = tmp0;
5400 alu.dst.chan = 2;
5401 alu.dst.write = 1;
5402
5403 alu.src[0].sel = tmp0;
5404 alu.src[0].chan = 0;
5405 if (signed_op) {
5406 alu.src[1].sel = tmp2;
5407 alu.src[1].chan = 1;
5408 } else {
5409 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5410 }
5411
5412 alu.last = 1;
5413 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5414 return r;
5415 }
5416
5417 /* 3. tmp0.w = -tmp0.z */
5418 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5419 alu.op = ALU_OP2_SUB_INT;
5420
5421 alu.dst.sel = tmp0;
5422 alu.dst.chan = 3;
5423 alu.dst.write = 1;
5424
5425 alu.src[0].sel = V_SQ_ALU_SRC_0;
5426 alu.src[1].sel = tmp0;
5427 alu.src[1].chan = 2;
5428
5429 alu.last = 1;
5430 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5431 return r;
5432
5433 /* 4. tmp0.y = hi (tmp0.x * src2) */
5434 if (ctx->bc->chip_class == CAYMAN) {
5435 for (j = 0 ; j < 4; j++) {
5436 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5437 alu.op = ALU_OP2_MULHI_UINT;
5438
5439 alu.dst.sel = tmp0;
5440 alu.dst.chan = j;
5441 alu.dst.write = (j == 1);
5442
5443 alu.src[0].sel = tmp0;
5444 alu.src[0].chan = 0;
5445
5446 if (signed_op) {
5447 alu.src[1].sel = tmp2;
5448 alu.src[1].chan = 1;
5449 } else {
5450 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5451 }
5452 alu.last = (j == 3);
5453 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5454 return r;
5455 }
5456 } else {
5457 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5458 alu.op = ALU_OP2_MULHI_UINT;
5459
5460 alu.dst.sel = tmp0;
5461 alu.dst.chan = 1;
5462 alu.dst.write = 1;
5463
5464 alu.src[0].sel = tmp0;
5465 alu.src[0].chan = 0;
5466
5467 if (signed_op) {
5468 alu.src[1].sel = tmp2;
5469 alu.src[1].chan = 1;
5470 } else {
5471 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5472 }
5473
5474 alu.last = 1;
5475 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5476 return r;
5477 }
5478
5479 /* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src)) */
5480 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5481 alu.op = ALU_OP3_CNDE_INT;
5482 alu.is_op3 = 1;
5483
5484 alu.dst.sel = tmp0;
5485 alu.dst.chan = 2;
5486 alu.dst.write = 1;
5487
5488 alu.src[0].sel = tmp0;
5489 alu.src[0].chan = 1;
5490 alu.src[1].sel = tmp0;
5491 alu.src[1].chan = 3;
5492 alu.src[2].sel = tmp0;
5493 alu.src[2].chan = 2;
5494
5495 alu.last = 1;
5496 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5497 return r;
5498
5499 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
5500 if (ctx->bc->chip_class == CAYMAN) {
5501 for (j = 0 ; j < 4; j++) {
5502 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5503 alu.op = ALU_OP2_MULHI_UINT;
5504
5505 alu.dst.sel = tmp0;
5506 alu.dst.chan = j;
5507 alu.dst.write = (j == 3);
5508
5509 alu.src[0].sel = tmp0;
5510 alu.src[0].chan = 2;
5511
5512 alu.src[1].sel = tmp0;
5513 alu.src[1].chan = 0;
5514
5515 alu.last = (j == 3);
5516 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5517 return r;
5518 }
5519 } else {
5520 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5521 alu.op = ALU_OP2_MULHI_UINT;
5522
5523 alu.dst.sel = tmp0;
5524 alu.dst.chan = 3;
5525 alu.dst.write = 1;
5526
5527 alu.src[0].sel = tmp0;
5528 alu.src[0].chan = 2;
5529
5530 alu.src[1].sel = tmp0;
5531 alu.src[1].chan = 0;
5532
5533 alu.last = 1;
5534 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5535 return r;
5536 }
5537
5538 /* 7. tmp1.x = tmp0.x - tmp0.w */
5539 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5540 alu.op = ALU_OP2_SUB_INT;
5541
5542 alu.dst.sel = tmp1;
5543 alu.dst.chan = 0;
5544 alu.dst.write = 1;
5545
5546 alu.src[0].sel = tmp0;
5547 alu.src[0].chan = 0;
5548 alu.src[1].sel = tmp0;
5549 alu.src[1].chan = 3;
5550
5551 alu.last = 1;
5552 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5553 return r;
5554
5555 /* 8. tmp1.y = tmp0.x + tmp0.w */
5556 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5557 alu.op = ALU_OP2_ADD_INT;
5558
5559 alu.dst.sel = tmp1;
5560 alu.dst.chan = 1;
5561 alu.dst.write = 1;
5562
5563 alu.src[0].sel = tmp0;
5564 alu.src[0].chan = 0;
5565 alu.src[1].sel = tmp0;
5566 alu.src[1].chan = 3;
5567
5568 alu.last = 1;
5569 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5570 return r;
5571
5572 /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
5573 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5574 alu.op = ALU_OP3_CNDE_INT;
5575 alu.is_op3 = 1;
5576
5577 alu.dst.sel = tmp0;
5578 alu.dst.chan = 0;
5579 alu.dst.write = 1;
5580
5581 alu.src[0].sel = tmp0;
5582 alu.src[0].chan = 1;
5583 alu.src[1].sel = tmp1;
5584 alu.src[1].chan = 1;
5585 alu.src[2].sel = tmp1;
5586 alu.src[2].chan = 0;
5587
5588 alu.last = 1;
5589 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5590 return r;
5591
5592 /* 10. tmp0.z = hi(tmp0.x * src1) = q */
5593 if (ctx->bc->chip_class == CAYMAN) {
5594 for (j = 0 ; j < 4; j++) {
5595 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5596 alu.op = ALU_OP2_MULHI_UINT;
5597
5598 alu.dst.sel = tmp0;
5599 alu.dst.chan = j;
5600 alu.dst.write = (j == 2);
5601
5602 alu.src[0].sel = tmp0;
5603 alu.src[0].chan = 0;
5604
5605 if (signed_op) {
5606 alu.src[1].sel = tmp2;
5607 alu.src[1].chan = 0;
5608 } else {
5609 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5610 }
5611
5612 alu.last = (j == 3);
5613 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5614 return r;
5615 }
5616 } else {
5617 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5618 alu.op = ALU_OP2_MULHI_UINT;
5619
5620 alu.dst.sel = tmp0;
5621 alu.dst.chan = 2;
5622 alu.dst.write = 1;
5623
5624 alu.src[0].sel = tmp0;
5625 alu.src[0].chan = 0;
5626
5627 if (signed_op) {
5628 alu.src[1].sel = tmp2;
5629 alu.src[1].chan = 0;
5630 } else {
5631 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5632 }
5633
5634 alu.last = 1;
5635 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5636 return r;
5637 }
5638
5639 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
5640 if (ctx->bc->chip_class == CAYMAN) {
5641 for (j = 0 ; j < 4; j++) {
5642 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5643 alu.op = ALU_OP2_MULLO_UINT;
5644
5645 alu.dst.sel = tmp0;
5646 alu.dst.chan = j;
5647 alu.dst.write = (j == 1);
5648
5649 if (signed_op) {
5650 alu.src[0].sel = tmp2;
5651 alu.src[0].chan = 1;
5652 } else {
5653 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5654 }
5655
5656 alu.src[1].sel = tmp0;
5657 alu.src[1].chan = 2;
5658
5659 alu.last = (j == 3);
5660 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5661 return r;
5662 }
5663 } else {
5664 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5665 alu.op = ALU_OP2_MULLO_UINT;
5666
5667 alu.dst.sel = tmp0;
5668 alu.dst.chan = 1;
5669 alu.dst.write = 1;
5670
5671 if (signed_op) {
5672 alu.src[0].sel = tmp2;
5673 alu.src[0].chan = 1;
5674 } else {
5675 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5676 }
5677
5678 alu.src[1].sel = tmp0;
5679 alu.src[1].chan = 2;
5680
5681 alu.last = 1;
5682 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5683 return r;
5684 }
5685
5686 /* 12. tmp0.w = src1 - tmp0.y = r */
5687 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5688 alu.op = ALU_OP2_SUB_INT;
5689
5690 alu.dst.sel = tmp0;
5691 alu.dst.chan = 3;
5692 alu.dst.write = 1;
5693
5694 if (signed_op) {
5695 alu.src[0].sel = tmp2;
5696 alu.src[0].chan = 0;
5697 } else {
5698 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5699 }
5700
5701 alu.src[1].sel = tmp0;
5702 alu.src[1].chan = 1;
5703
5704 alu.last = 1;
5705 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5706 return r;
5707
5708 /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
5709 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5710 alu.op = ALU_OP2_SETGE_UINT;
5711
5712 alu.dst.sel = tmp1;
5713 alu.dst.chan = 0;
5714 alu.dst.write = 1;
5715
5716 alu.src[0].sel = tmp0;
5717 alu.src[0].chan = 3;
5718 if (signed_op) {
5719 alu.src[1].sel = tmp2;
5720 alu.src[1].chan = 1;
5721 } else {
5722 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5723 }
5724
5725 alu.last = 1;
5726 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5727 return r;
5728
5729 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
5730 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5731 alu.op = ALU_OP2_SETGE_UINT;
5732
5733 alu.dst.sel = tmp1;
5734 alu.dst.chan = 1;
5735 alu.dst.write = 1;
5736
5737 if (signed_op) {
5738 alu.src[0].sel = tmp2;
5739 alu.src[0].chan = 0;
5740 } else {
5741 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5742 }
5743
5744 alu.src[1].sel = tmp0;
5745 alu.src[1].chan = 1;
5746
5747 alu.last = 1;
5748 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5749 return r;
5750
5751 if (mod) { /* UMOD */
5752
5753 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */
5754 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5755 alu.op = ALU_OP2_SUB_INT;
5756
5757 alu.dst.sel = tmp1;
5758 alu.dst.chan = 2;
5759 alu.dst.write = 1;
5760
5761 alu.src[0].sel = tmp0;
5762 alu.src[0].chan = 3;
5763
5764 if (signed_op) {
5765 alu.src[1].sel = tmp2;
5766 alu.src[1].chan = 1;
5767 } else {
5768 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5769 }
5770
5771 alu.last = 1;
5772 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5773 return r;
5774
5775 /* 16. tmp1.w = tmp0.w + src2 = r + src2 */
5776 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5777 alu.op = ALU_OP2_ADD_INT;
5778
5779 alu.dst.sel = tmp1;
5780 alu.dst.chan = 3;
5781 alu.dst.write = 1;
5782
5783 alu.src[0].sel = tmp0;
5784 alu.src[0].chan = 3;
5785 if (signed_op) {
5786 alu.src[1].sel = tmp2;
5787 alu.src[1].chan = 1;
5788 } else {
5789 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5790 }
5791
5792 alu.last = 1;
5793 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5794 return r;
5795
5796 } else { /* UDIV */
5797
5798 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
5799 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5800 alu.op = ALU_OP2_ADD_INT;
5801
5802 alu.dst.sel = tmp1;
5803 alu.dst.chan = 2;
5804 alu.dst.write = 1;
5805
5806 alu.src[0].sel = tmp0;
5807 alu.src[0].chan = 2;
5808 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
5809
5810 alu.last = 1;
5811 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5812 return r;
5813
5814 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */
5815 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5816 alu.op = ALU_OP2_ADD_INT;
5817
5818 alu.dst.sel = tmp1;
5819 alu.dst.chan = 3;
5820 alu.dst.write = 1;
5821
5822 alu.src[0].sel = tmp0;
5823 alu.src[0].chan = 2;
5824 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
5825
5826 alu.last = 1;
5827 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5828 return r;
5829
5830 }
5831
5832 /* 17. tmp1.x = tmp1.x & tmp1.y */
5833 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5834 alu.op = ALU_OP2_AND_INT;
5835
5836 alu.dst.sel = tmp1;
5837 alu.dst.chan = 0;
5838 alu.dst.write = 1;
5839
5840 alu.src[0].sel = tmp1;
5841 alu.src[0].chan = 0;
5842 alu.src[1].sel = tmp1;
5843 alu.src[1].chan = 1;
5844
5845 alu.last = 1;
5846 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5847 return r;
5848
5849 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
5850 /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
5851 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5852 alu.op = ALU_OP3_CNDE_INT;
5853 alu.is_op3 = 1;
5854
5855 alu.dst.sel = tmp0;
5856 alu.dst.chan = 2;
5857 alu.dst.write = 1;
5858
5859 alu.src[0].sel = tmp1;
5860 alu.src[0].chan = 0;
5861 alu.src[1].sel = tmp0;
5862 alu.src[1].chan = mod ? 3 : 2;
5863 alu.src[2].sel = tmp1;
5864 alu.src[2].chan = 2;
5865
5866 alu.last = 1;
5867 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5868 return r;
5869
5870 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
5871 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5872 alu.op = ALU_OP3_CNDE_INT;
5873 alu.is_op3 = 1;
5874
5875 if (signed_op) {
5876 alu.dst.sel = tmp0;
5877 alu.dst.chan = 2;
5878 alu.dst.write = 1;
5879 } else {
5880 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5881 }
5882
5883 alu.src[0].sel = tmp1;
5884 alu.src[0].chan = 1;
5885 alu.src[1].sel = tmp1;
5886 alu.src[1].chan = 3;
5887 alu.src[2].sel = tmp0;
5888 alu.src[2].chan = 2;
5889
5890 alu.last = 1;
5891 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5892 return r;
5893
5894 if (signed_op) {
5895
5896 /* fix the sign of the result */
5897
5898 if (mod) {
5899
5900 /* tmp0.x = -tmp0.z */
5901 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5902 alu.op = ALU_OP2_SUB_INT;
5903
5904 alu.dst.sel = tmp0;
5905 alu.dst.chan = 0;
5906 alu.dst.write = 1;
5907
5908 alu.src[0].sel = V_SQ_ALU_SRC_0;
5909 alu.src[1].sel = tmp0;
5910 alu.src[1].chan = 2;
5911
5912 alu.last = 1;
5913 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5914 return r;
5915
5916 /* sign of the remainder is the same as the sign of src0 */
5917 /* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
5918 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5919 alu.op = ALU_OP3_CNDGE_INT;
5920 alu.is_op3 = 1;
5921
5922 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5923
5924 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5925 alu.src[1].sel = tmp0;
5926 alu.src[1].chan = 2;
5927 alu.src[2].sel = tmp0;
5928 alu.src[2].chan = 0;
5929
5930 alu.last = 1;
5931 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5932 return r;
5933
5934 } else {
5935
5936 /* tmp0.x = -tmp0.z */
5937 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5938 alu.op = ALU_OP2_SUB_INT;
5939
5940 alu.dst.sel = tmp0;
5941 alu.dst.chan = 0;
5942 alu.dst.write = 1;
5943
5944 alu.src[0].sel = V_SQ_ALU_SRC_0;
5945 alu.src[1].sel = tmp0;
5946 alu.src[1].chan = 2;
5947
5948 alu.last = 1;
5949 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5950 return r;
5951
5952 /* fix the quotient sign (same as the sign of src0*src1) */
5953 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
5954 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5955 alu.op = ALU_OP3_CNDGE_INT;
5956 alu.is_op3 = 1;
5957
5958 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5959
5960 alu.src[0].sel = tmp2;
5961 alu.src[0].chan = 2;
5962 alu.src[1].sel = tmp0;
5963 alu.src[1].chan = 2;
5964 alu.src[2].sel = tmp0;
5965 alu.src[2].chan = 0;
5966
5967 alu.last = 1;
5968 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5969 return r;
5970 }
5971 }
5972 }
5973 return 0;
5974 }
5975
/* Emit code for TGSI UDIV: unsigned integer division (quotient). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int want_mod = 0;   /* quotient, not remainder */
	const int is_signed = 0;  /* unsigned operands */

	return tgsi_divmod(ctx, want_mod, is_signed);
}
5980
/* Emit code for TGSI UMOD: unsigned integer modulo (remainder). */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int want_mod = 1;   /* remainder, not quotient */
	const int is_signed = 0;  /* unsigned operands */

	return tgsi_divmod(ctx, want_mod, is_signed);
}
5985
/* Emit code for TGSI IDIV: signed integer division (quotient). */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int want_mod = 0;   /* quotient, not remainder */
	const int is_signed = 1;  /* signed operands */

	return tgsi_divmod(ctx, want_mod, is_signed);
}
5990
/* Emit code for TGSI IMOD: signed integer modulo (remainder). */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int want_mod = 1;   /* remainder, not quotient */
	const int is_signed = 1;  /* signed operands */

	return tgsi_divmod(ctx, want_mod, is_signed);
}
5995
5996
5997 static int tgsi_f2i(struct r600_shader_ctx *ctx)
5998 {
5999 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6000 struct r600_bytecode_alu alu;
6001 int i, r;
6002 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6003 int last_inst = tgsi_last_instruction(write_mask);
6004
6005 for (i = 0; i < 4; i++) {
6006 if (!(write_mask & (1<<i)))
6007 continue;
6008
6009 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6010 alu.op = ALU_OP1_TRUNC;
6011
6012 alu.dst.sel = ctx->temp_reg;
6013 alu.dst.chan = i;
6014 alu.dst.write = 1;
6015
6016 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6017 if (i == last_inst)
6018 alu.last = 1;
6019 r = r600_bytecode_add_alu(ctx->bc, &alu);
6020 if (r)
6021 return r;
6022 }
6023
6024 for (i = 0; i < 4; i++) {
6025 if (!(write_mask & (1<<i)))
6026 continue;
6027
6028 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6029 alu.op = ctx->inst_info->op;
6030
6031 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6032
6033 alu.src[0].sel = ctx->temp_reg;
6034 alu.src[0].chan = i;
6035
6036 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
6037 alu.last = 1;
6038 r = r600_bytecode_add_alu(ctx->bc, &alu);
6039 if (r)
6040 return r;
6041 }
6042
6043 return 0;
6044 }
6045
6046 static int tgsi_iabs(struct r600_shader_ctx *ctx)
6047 {
6048 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6049 struct r600_bytecode_alu alu;
6050 int i, r;
6051 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6052 int last_inst = tgsi_last_instruction(write_mask);
6053
6054 /* tmp = -src */
6055 for (i = 0; i < 4; i++) {
6056 if (!(write_mask & (1<<i)))
6057 continue;
6058
6059 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6060 alu.op = ALU_OP2_SUB_INT;
6061
6062 alu.dst.sel = ctx->temp_reg;
6063 alu.dst.chan = i;
6064 alu.dst.write = 1;
6065
6066 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6067 alu.src[0].sel = V_SQ_ALU_SRC_0;
6068
6069 if (i == last_inst)
6070 alu.last = 1;
6071 r = r600_bytecode_add_alu(ctx->bc, &alu);
6072 if (r)
6073 return r;
6074 }
6075
6076 /* dst = (src >= 0 ? src : tmp) */
6077 for (i = 0; i < 4; i++) {
6078 if (!(write_mask & (1<<i)))
6079 continue;
6080
6081 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6082 alu.op = ALU_OP3_CNDGE_INT;
6083 alu.is_op3 = 1;
6084 alu.dst.write = 1;
6085
6086 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6087
6088 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6089 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6090 alu.src[2].sel = ctx->temp_reg;
6091 alu.src[2].chan = i;
6092
6093 if (i == last_inst)
6094 alu.last = 1;
6095 r = r600_bytecode_add_alu(ctx->bc, &alu);
6096 if (r)
6097 return r;
6098 }
6099 return 0;
6100 }
6101
6102 static int tgsi_issg(struct r600_shader_ctx *ctx)
6103 {
6104 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6105 struct r600_bytecode_alu alu;
6106 int i, r;
6107 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6108 int last_inst = tgsi_last_instruction(write_mask);
6109
6110 /* tmp = (src >= 0 ? src : -1) */
6111 for (i = 0; i < 4; i++) {
6112 if (!(write_mask & (1<<i)))
6113 continue;
6114
6115 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6116 alu.op = ALU_OP3_CNDGE_INT;
6117 alu.is_op3 = 1;
6118
6119 alu.dst.sel = ctx->temp_reg;
6120 alu.dst.chan = i;
6121 alu.dst.write = 1;
6122
6123 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6124 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6125 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
6126
6127 if (i == last_inst)
6128 alu.last = 1;
6129 r = r600_bytecode_add_alu(ctx->bc, &alu);
6130 if (r)
6131 return r;
6132 }
6133
6134 /* dst = (tmp > 0 ? 1 : tmp) */
6135 for (i = 0; i < 4; i++) {
6136 if (!(write_mask & (1<<i)))
6137 continue;
6138
6139 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6140 alu.op = ALU_OP3_CNDGT_INT;
6141 alu.is_op3 = 1;
6142 alu.dst.write = 1;
6143
6144 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6145
6146 alu.src[0].sel = ctx->temp_reg;
6147 alu.src[0].chan = i;
6148
6149 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6150
6151 alu.src[2].sel = ctx->temp_reg;
6152 alu.src[2].chan = i;
6153
6154 if (i == last_inst)
6155 alu.last = 1;
6156 r = r600_bytecode_add_alu(ctx->bc, &alu);
6157 if (r)
6158 return r;
6159 }
6160 return 0;
6161 }
6162
6163
6164
6165 static int tgsi_ssg(struct r600_shader_ctx *ctx)
6166 {
6167 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6168 struct r600_bytecode_alu alu;
6169 int i, r;
6170
6171 /* tmp = (src > 0 ? 1 : src) */
6172 for (i = 0; i < 4; i++) {
6173 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6174 alu.op = ALU_OP3_CNDGT;
6175 alu.is_op3 = 1;
6176
6177 alu.dst.sel = ctx->temp_reg;
6178 alu.dst.chan = i;
6179
6180 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6181 alu.src[1].sel = V_SQ_ALU_SRC_1;
6182 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6183
6184 if (i == 3)
6185 alu.last = 1;
6186 r = r600_bytecode_add_alu(ctx->bc, &alu);
6187 if (r)
6188 return r;
6189 }
6190
6191 /* dst = (-tmp > 0 ? -1 : tmp) */
6192 for (i = 0; i < 4; i++) {
6193 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6194 alu.op = ALU_OP3_CNDGT;
6195 alu.is_op3 = 1;
6196 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6197
6198 alu.src[0].sel = ctx->temp_reg;
6199 alu.src[0].chan = i;
6200 alu.src[0].neg = 1;
6201
6202 alu.src[1].sel = V_SQ_ALU_SRC_1;
6203 alu.src[1].neg = 1;
6204
6205 alu.src[2].sel = ctx->temp_reg;
6206 alu.src[2].chan = i;
6207
6208 if (i == 3)
6209 alu.last = 1;
6210 r = r600_bytecode_add_alu(ctx->bc, &alu);
6211 if (r)
6212 return r;
6213 }
6214 return 0;
6215 }
6216
/* Emit code for TGSI BFI (bitfield insert).
 *
 * Operands: src0 = base value, src1 = insert value, src2 = bit offset,
 * src3 = bit count.  Sequence per written channel:
 *   temp_reg = (bits >= 32)           -- flag for the wide-field case
 *   t1       = BFM(bits, offset)      -- build the insertion mask
 *   t2       = insert << offset       -- align the insert value
 *   dst      = BFI(t1, t2, base)      -- merge under the mask
 *   dst      = flag ? dst : insert    -- bits >= 32 must yield the raw
 *                                        insert value, which BFM/BFI
 *                                        alone does not produce
 *
 * NOTE(review): the final CNDE reads alu.dst.sel filled in by tgsi_dst()
 * just above it, so statement order in that loop is load-bearing.
 */
static int tgsi_bfi(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	t1 = r600_get_temp(ctx);

	/* temp_reg.i = (src3.i >= 32) -- remember the wide-field channels */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* create mask tmp: t1.i = BFM(src3.i, src2.i) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_BFM_INT;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* shift insert left: t2.i = src1.i << src2.i */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHL_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* actual bitfield insert: dst.i = BFI(t1.i, t2.i, src0.i) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_BFI_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* bits >= 32 fixup: dst.i = (flag == 0) ? dst.i : src1.i */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[1], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		/* read back the BFI result from the destination register */
		alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;

		alu.last = i == last_inst;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6334
6335 static int tgsi_msb(struct r600_shader_ctx *ctx)
6336 {
6337 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6338 struct r600_bytecode_alu alu;
6339 int i, r, t1, t2;
6340
6341 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6342 int last_inst = tgsi_last_instruction(write_mask);
6343
6344 assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
6345 ctx->inst_info->op == ALU_OP1_FFBH_UINT);
6346
6347 t1 = ctx->temp_reg;
6348
6349 /* bit position is indexed from lsb by TGSI, and from msb by the hardware */
6350 for (i = 0; i < 4; i++) {
6351 if (!(write_mask & (1<<i)))
6352 continue;
6353
6354 /* t1 = FFBH_INT / FFBH_UINT */
6355 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6356 alu.op = ctx->inst_info->op;
6357 alu.dst.sel = t1;
6358 alu.dst.chan = i;
6359 alu.dst.write = 1;
6360 alu.last = i == last_inst;
6361
6362 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6363
6364 r = r600_bytecode_add_alu(ctx->bc, &alu);
6365 if (r)
6366 return r;
6367 }
6368
6369 t2 = r600_get_temp(ctx);
6370
6371 for (i = 0; i < 4; i++) {
6372 if (!(write_mask & (1<<i)))
6373 continue;
6374
6375 /* t2 = 31 - t1 */
6376 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6377 alu.op = ALU_OP2_SUB_INT;
6378 alu.dst.sel = t2;
6379 alu.dst.chan = i;
6380 alu.dst.write = 1;
6381 alu.last = i == last_inst;
6382
6383 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
6384 alu.src[0].value = 31;
6385 alu.src[1].sel = t1;
6386 alu.src[1].chan = i;
6387
6388 r = r600_bytecode_add_alu(ctx->bc, &alu);
6389 if (r)
6390 return r;
6391 }
6392
6393 for (i = 0; i < 4; i++) {
6394 if (!(write_mask & (1<<i)))
6395 continue;
6396
6397 /* result = t1 >= 0 ? t2 : t1 */
6398 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6399 alu.op = ALU_OP3_CNDGE_INT;
6400 alu.is_op3 = 1;
6401 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6402 alu.dst.chan = i;
6403 alu.dst.write = 1;
6404 alu.last = i == last_inst;
6405
6406 alu.src[0].sel = t1;
6407 alu.src[0].chan = i;
6408 alu.src[1].sel = t2;
6409 alu.src[1].chan = i;
6410 alu.src[2].sel = t1;
6411 alu.src[2].chan = i;
6412
6413 r = r600_bytecode_add_alu(ctx->bc, &alu);
6414 if (r)
6415 return r;
6416 }
6417
6418 return 0;
6419 }
6420
6421 static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
6422 {
6423 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6424 struct r600_bytecode_alu alu;
6425 int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
6426 unsigned location;
6427 const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;
6428
6429 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
6430
6431 /* Interpolators have been marked for use already by allocate_system_value_inputs */
6432 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6433 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6434 location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
6435 }
6436 else {
6437 location = TGSI_INTERPOLATE_LOC_CENTROID;
6438 }
6439
6440 k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
6441 if (k < 0)
6442 k = 0;
6443 interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
6444 interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);
6445
6446 /* NOTE: currently offset is not perspective correct */
6447 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6448 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6449 int sample_gpr = -1;
6450 int gradientsH, gradientsV;
6451 struct r600_bytecode_tex tex;
6452
6453 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6454 sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
6455 }
6456
6457 gradientsH = r600_get_temp(ctx);
6458 gradientsV = r600_get_temp(ctx);
6459 for (i = 0; i < 2; i++) {
6460 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6461 tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
6462 tex.src_gpr = interp_gpr;
6463 tex.src_sel_x = interp_base_chan + 0;
6464 tex.src_sel_y = interp_base_chan + 1;
6465 tex.src_sel_z = 0;
6466 tex.src_sel_w = 0;
6467 tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
6468 tex.dst_sel_x = 0;
6469 tex.dst_sel_y = 1;
6470 tex.dst_sel_z = 7;
6471 tex.dst_sel_w = 7;
6472 tex.inst_mod = 1; // Use per pixel gradient calculation
6473 tex.sampler_id = 0;
6474 tex.resource_id = tex.sampler_id;
6475 r = r600_bytecode_add_tex(ctx->bc, &tex);
6476 if (r)
6477 return r;
6478 }
6479
6480 for (i = 0; i < 2; i++) {
6481 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6482 alu.op = ALU_OP3_MULADD;
6483 alu.is_op3 = 1;
6484 alu.src[0].sel = gradientsH;
6485 alu.src[0].chan = i;
6486 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6487 alu.src[1].sel = sample_gpr;
6488 alu.src[1].chan = 2;
6489 }
6490 else {
6491 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
6492 }
6493 alu.src[2].sel = interp_gpr;
6494 alu.src[2].chan = interp_base_chan + i;
6495 alu.dst.sel = ctx->temp_reg;
6496 alu.dst.chan = i;
6497 alu.last = i == 1;
6498
6499 r = r600_bytecode_add_alu(ctx->bc, &alu);
6500 if (r)
6501 return r;
6502 }
6503
6504 for (i = 0; i < 2; i++) {
6505 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6506 alu.op = ALU_OP3_MULADD;
6507 alu.is_op3 = 1;
6508 alu.src[0].sel = gradientsV;
6509 alu.src[0].chan = i;
6510 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6511 alu.src[1].sel = sample_gpr;
6512 alu.src[1].chan = 3;
6513 }
6514 else {
6515 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
6516 }
6517 alu.src[2].sel = ctx->temp_reg;
6518 alu.src[2].chan = i;
6519 alu.dst.sel = ctx->temp_reg;
6520 alu.dst.chan = i;
6521 alu.last = i == 1;
6522
6523 r = r600_bytecode_add_alu(ctx->bc, &alu);
6524 if (r)
6525 return r;
6526 }
6527 }
6528
6529 tmp = r600_get_temp(ctx);
6530 for (i = 0; i < 8; i++) {
6531 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6532 alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;
6533
6534 alu.dst.sel = tmp;
6535 if ((i > 1 && i < 6)) {
6536 alu.dst.write = 1;
6537 }
6538 else {
6539 alu.dst.write = 0;
6540 }
6541 alu.dst.chan = i % 4;
6542
6543 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6544 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6545 alu.src[0].sel = ctx->temp_reg;
6546 alu.src[0].chan = 1 - (i % 2);
6547 } else {
6548 alu.src[0].sel = interp_gpr;
6549 alu.src[0].chan = interp_base_chan + 1 - (i % 2);
6550 }
6551 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
6552 alu.src[1].chan = 0;
6553
6554 alu.last = i % 4 == 3;
6555 alu.bank_swizzle_force = SQ_ALU_VEC_210;
6556
6557 r = r600_bytecode_add_alu(ctx->bc, &alu);
6558 if (r)
6559 return r;
6560 }
6561
6562 // INTERP can't swizzle dst
6563 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6564 for (i = 0; i <= lasti; i++) {
6565 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6566 continue;
6567
6568 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6569 alu.op = ALU_OP1_MOV;
6570 alu.src[0].sel = tmp;
6571 alu.src[0].chan = ctx->src[0].swizzle[i];
6572 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6573 alu.dst.write = 1;
6574 alu.last = i == lasti;
6575 r = r600_bytecode_add_alu(ctx->bc, &alu);
6576 if (r)
6577 return r;
6578 }
6579
6580 return 0;
6581 }
6582
6583
6584 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
6585 {
6586 struct r600_bytecode_alu alu;
6587 int i, r;
6588
6589 for (i = 0; i < 4; i++) {
6590 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6591 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
6592 alu.op = ALU_OP0_NOP;
6593 alu.dst.chan = i;
6594 } else {
6595 alu.op = ALU_OP1_MOV;
6596 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6597 alu.src[0].sel = ctx->temp_reg;
6598 alu.src[0].chan = i;
6599 }
6600 if (i == 3) {
6601 alu.last = 1;
6602 }
6603 r = r600_bytecode_add_alu(ctx->bc, &alu);
6604 if (r)
6605 return r;
6606 }
6607 return 0;
6608 }
6609
6610 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
6611 unsigned temp, int chan,
6612 struct r600_bytecode_alu_src *bc_src,
6613 const struct r600_shader_src *shader_src)
6614 {
6615 struct r600_bytecode_alu alu;
6616 int r;
6617
6618 r600_bytecode_src(bc_src, shader_src, chan);
6619
6620 /* op3 operands don't support abs modifier */
6621 if (bc_src->abs) {
6622 assert(temp!=0); /* we actually need the extra register, make sure it is allocated. */
6623 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6624 alu.op = ALU_OP1_MOV;
6625 alu.dst.sel = temp;
6626 alu.dst.chan = chan;
6627 alu.dst.write = 1;
6628
6629 alu.src[0] = *bc_src;
6630 alu.last = true; // sufficient?
6631 r = r600_bytecode_add_alu(ctx->bc, &alu);
6632 if (r)
6633 return r;
6634
6635 memset(bc_src, 0, sizeof(*bc_src));
6636 bc_src->sel = temp;
6637 bc_src->chan = chan;
6638 }
6639 return 0;
6640 }
6641
6642 static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)
6643 {
6644 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6645 struct r600_bytecode_alu alu;
6646 int i, j, r;
6647 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6648 int temp_regs[4];
6649 unsigned op = ctx->inst_info->op;
6650
6651 if (op == ALU_OP3_MULADD_IEEE &&
6652 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
6653 op = ALU_OP3_MULADD;
6654
6655 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6656 temp_regs[j] = 0;
6657 if (ctx->src[j].abs)
6658 temp_regs[j] = r600_get_temp(ctx);
6659 }
6660 for (i = 0; i < lasti + 1; i++) {
6661 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6662 continue;
6663
6664 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6665 alu.op = op;
6666 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6667 r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
6668 if (r)
6669 return r;
6670 }
6671
6672 if (dst == -1) {
6673 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6674 } else {
6675 alu.dst.sel = dst;
6676 }
6677 alu.dst.chan = i;
6678 alu.dst.write = 1;
6679 alu.is_op3 = 1;
6680 if (i == lasti) {
6681 alu.last = 1;
6682 }
6683 r = r600_bytecode_add_alu(ctx->bc, &alu);
6684 if (r)
6685 return r;
6686 }
6687 return 0;
6688 }
6689
/* Standard op3 handler: emit the instruction to the TGSI destination
 * register (dst == -1 selects inst->Dst[0] in tgsi_op3_dst). */
static int tgsi_op3(struct r600_shader_ctx *ctx)
{
	return tgsi_op3_dst(ctx, -1);
}
6694
6695 static int tgsi_dp(struct r600_shader_ctx *ctx)
6696 {
6697 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6698 struct r600_bytecode_alu alu;
6699 int i, j, r;
6700 unsigned op = ctx->inst_info->op;
6701 if (op == ALU_OP2_DOT4_IEEE &&
6702 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
6703 op = ALU_OP2_DOT4;
6704
6705 for (i = 0; i < 4; i++) {
6706 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6707 alu.op = op;
6708 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6709 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
6710 }
6711
6712 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6713 alu.dst.chan = i;
6714 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
6715 /* handle some special cases */
6716 switch (inst->Instruction.Opcode) {
6717 case TGSI_OPCODE_DP2:
6718 if (i > 1) {
6719 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6720 alu.src[0].chan = alu.src[1].chan = 0;
6721 }
6722 break;
6723 case TGSI_OPCODE_DP3:
6724 if (i > 2) {
6725 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6726 alu.src[0].chan = alu.src[1].chan = 0;
6727 }
6728 break;
6729 default:
6730 break;
6731 }
6732 if (i == 3) {
6733 alu.last = 1;
6734 }
6735 r = r600_bytecode_add_alu(ctx->bc, &alu);
6736 if (r)
6737 return r;
6738 }
6739 return 0;
6740 }
6741
6742 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
6743 unsigned index)
6744 {
6745 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6746 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
6747 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
6748 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
6749 ctx->src[index].neg || ctx->src[index].abs ||
6750 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
6751 }
6752
6753 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
6754 unsigned index)
6755 {
6756 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6757 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
6758 }
6759
/* Emit a buffer-texture texel fetch (TXF on TGSI_TEXTURE_BUFFER) as a
 * vertex-fetch (VFETCH) instruction.  If the coordinate cannot be used
 * directly (see tgsi_tex_src_requires_loading) it is first copied into
 * ctx->temp_reg.  On pre-Evergreen chips the raw fetch result is then
 * patched with the driver-maintained buffer-info constants: an AND per
 * written channel, plus an OR into W when X or Y is written.
 * Returns 0 on success or the bytecode-emission error code. */
static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
{
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_alu alu;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int src_gpr, r, i;
	/* src 1 of TXF on a buffer is the resource index */
	int id = tgsi_tex_get_src_gpr(ctx, 1);

	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
	if (src_requires_loading) {
		/* copy all four coordinate channels into temp_reg */
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		src_gpr = ctx->temp_reg;
	}

	/* the actual fetch: texture buffers live after the const buffers */
	memset(&vtx, 0, sizeof(vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = src_gpr;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	/* mask out (sel 7) any channel not in the write mask */
	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
	vtx.use_const_fields = 1;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	/* Evergreen+ needs no post-fetch fixup */
	if (ctx->bc->chip_class >= EVERGREEN)
		return 0;

	/* pre-Evergreen: AND each written channel with the per-buffer
	 * info constant (two constants per buffer id) */
	for (i = 0; i < 4; i++) {
		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;

		alu.dst.chan = i;
		alu.dst.sel = vtx.dst_gpr;
		alu.dst.write = 1;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = i;

		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
		alu.src[1].sel += (id * 2);
		alu.src[1].chan = i % 4;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* pre-Evergreen: when X or Y is written, OR W with the second
	 * buffer-info constant */
	if (inst->Dst[0].Register.WriteMask & 3) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_OR_INT;

		alu.dst.chan = 3;
		alu.dst.sel = vtx.dst_gpr;
		alu.dst.write = 1;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = 3;

		alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
		alu.src[1].chan = 0;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6854
6855 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset)
6856 {
6857 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6858 struct r600_bytecode_alu alu;
6859 int r;
6860 int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
6861
6862 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6863 alu.op = ALU_OP1_MOV;
6864 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
6865 if (ctx->bc->chip_class >= EVERGREEN) {
6866 /* channel 0 or 2 of each word */
6867 alu.src[0].sel += (id / 2);
6868 alu.src[0].chan = (id % 2) * 2;
6869 } else {
6870 /* r600 we have them at channel 2 of the second dword */
6871 alu.src[0].sel += (id * 2) + 1;
6872 alu.src[0].chan = 1;
6873 }
6874 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6875 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
6876 alu.last = 1;
6877 r = r600_bytecode_add_alu(ctx->bc, &alu);
6878 if (r)
6879 return r;
6880 return 0;
6881 }
6882
6883 static int tgsi_tex(struct r600_shader_ctx *ctx)
6884 {
6885 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6886 struct r600_bytecode_tex tex;
6887 struct r600_bytecode_alu alu;
6888 unsigned src_gpr;
6889 int r, i, j;
6890 int opcode;
6891 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
6892 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
6893 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
6894 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
6895
6896 bool txf_add_offsets = inst->Texture.NumOffsets &&
6897 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
6898 inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
6899
6900 /* Texture fetch instructions can only use gprs as source.
6901 * Also they cannot negate the source or take the absolute value */
6902 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
6903 tgsi_tex_src_requires_loading(ctx, 0)) ||
6904 read_compressed_msaa || txf_add_offsets;
6905
6906 boolean src_loaded = FALSE;
6907 unsigned sampler_src_reg = 1;
6908 int8_t offset_x = 0, offset_y = 0, offset_z = 0;
6909 boolean has_txq_cube_array_z = false;
6910 unsigned sampler_index_mode;
6911
6912 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
6913 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6914 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
6915 if (inst->Dst[0].Register.WriteMask & 4) {
6916 ctx->shader->has_txq_cube_array_z_comp = true;
6917 has_txq_cube_array_z = true;
6918 }
6919
6920 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
6921 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6922 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
6923 inst->Instruction.Opcode == TGSI_OPCODE_TG4)
6924 sampler_src_reg = 2;
6925
6926 /* TGSI moves the sampler to src reg 3 for TXD */
6927 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
6928 sampler_src_reg = 3;
6929
6930 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
6931
6932 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
6933
6934 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
6935 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
6936 ctx->shader->uses_tex_buffers = true;
6937 return r600_do_buffer_txq(ctx, 1, 0);
6938 }
6939 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
6940 if (ctx->bc->chip_class < EVERGREEN)
6941 ctx->shader->uses_tex_buffers = true;
6942 return do_vtx_fetch_inst(ctx, src_requires_loading);
6943 }
6944 }
6945
6946 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
6947 int out_chan;
6948 /* Add perspective divide */
6949 if (ctx->bc->chip_class == CAYMAN) {
6950 out_chan = 2;
6951 for (i = 0; i < 3; i++) {
6952 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6953 alu.op = ALU_OP1_RECIP_IEEE;
6954 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6955
6956 alu.dst.sel = ctx->temp_reg;
6957 alu.dst.chan = i;
6958 if (i == 2)
6959 alu.last = 1;
6960 if (out_chan == i)
6961 alu.dst.write = 1;
6962 r = r600_bytecode_add_alu(ctx->bc, &alu);
6963 if (r)
6964 return r;
6965 }
6966
6967 } else {
6968 out_chan = 3;
6969 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6970 alu.op = ALU_OP1_RECIP_IEEE;
6971 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6972
6973 alu.dst.sel = ctx->temp_reg;
6974 alu.dst.chan = out_chan;
6975 alu.last = 1;
6976 alu.dst.write = 1;
6977 r = r600_bytecode_add_alu(ctx->bc, &alu);
6978 if (r)
6979 return r;
6980 }
6981
6982 for (i = 0; i < 3; i++) {
6983 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6984 alu.op = ALU_OP2_MUL;
6985 alu.src[0].sel = ctx->temp_reg;
6986 alu.src[0].chan = out_chan;
6987 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6988 alu.dst.sel = ctx->temp_reg;
6989 alu.dst.chan = i;
6990 alu.dst.write = 1;
6991 r = r600_bytecode_add_alu(ctx->bc, &alu);
6992 if (r)
6993 return r;
6994 }
6995 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6996 alu.op = ALU_OP1_MOV;
6997 alu.src[0].sel = V_SQ_ALU_SRC_1;
6998 alu.src[0].chan = 0;
6999 alu.dst.sel = ctx->temp_reg;
7000 alu.dst.chan = 3;
7001 alu.last = 1;
7002 alu.dst.write = 1;
7003 r = r600_bytecode_add_alu(ctx->bc, &alu);
7004 if (r)
7005 return r;
7006 src_loaded = TRUE;
7007 src_gpr = ctx->temp_reg;
7008 }
7009
7010
7011 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7012 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7013 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7014 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7015 inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
7016
7017 static const unsigned src0_swizzle[] = {2, 2, 0, 1};
7018 static const unsigned src1_swizzle[] = {1, 0, 2, 2};
7019
7020 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
7021 for (i = 0; i < 4; i++) {
7022 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7023 alu.op = ALU_OP2_CUBE;
7024 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7025 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
7026 alu.dst.sel = ctx->temp_reg;
7027 alu.dst.chan = i;
7028 if (i == 3)
7029 alu.last = 1;
7030 alu.dst.write = 1;
7031 r = r600_bytecode_add_alu(ctx->bc, &alu);
7032 if (r)
7033 return r;
7034 }
7035
7036 /* tmp1.z = RCP_e(|tmp1.z|) */
7037 if (ctx->bc->chip_class == CAYMAN) {
7038 for (i = 0; i < 3; i++) {
7039 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7040 alu.op = ALU_OP1_RECIP_IEEE;
7041 alu.src[0].sel = ctx->temp_reg;
7042 alu.src[0].chan = 2;
7043 alu.src[0].abs = 1;
7044 alu.dst.sel = ctx->temp_reg;
7045 alu.dst.chan = i;
7046 if (i == 2)
7047 alu.dst.write = 1;
7048 if (i == 2)
7049 alu.last = 1;
7050 r = r600_bytecode_add_alu(ctx->bc, &alu);
7051 if (r)
7052 return r;
7053 }
7054 } else {
7055 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7056 alu.op = ALU_OP1_RECIP_IEEE;
7057 alu.src[0].sel = ctx->temp_reg;
7058 alu.src[0].chan = 2;
7059 alu.src[0].abs = 1;
7060 alu.dst.sel = ctx->temp_reg;
7061 alu.dst.chan = 2;
7062 alu.dst.write = 1;
7063 alu.last = 1;
7064 r = r600_bytecode_add_alu(ctx->bc, &alu);
7065 if (r)
7066 return r;
7067 }
7068
7069 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
7070 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
7071 * muladd has no writemask, have to use another temp
7072 */
7073 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7074 alu.op = ALU_OP3_MULADD;
7075 alu.is_op3 = 1;
7076
7077 alu.src[0].sel = ctx->temp_reg;
7078 alu.src[0].chan = 0;
7079 alu.src[1].sel = ctx->temp_reg;
7080 alu.src[1].chan = 2;
7081
7082 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7083 alu.src[2].chan = 0;
7084 alu.src[2].value = u_bitcast_f2u(1.5f);
7085
7086 alu.dst.sel = ctx->temp_reg;
7087 alu.dst.chan = 0;
7088 alu.dst.write = 1;
7089
7090 r = r600_bytecode_add_alu(ctx->bc, &alu);
7091 if (r)
7092 return r;
7093
7094 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7095 alu.op = ALU_OP3_MULADD;
7096 alu.is_op3 = 1;
7097
7098 alu.src[0].sel = ctx->temp_reg;
7099 alu.src[0].chan = 1;
7100 alu.src[1].sel = ctx->temp_reg;
7101 alu.src[1].chan = 2;
7102
7103 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7104 alu.src[2].chan = 0;
7105 alu.src[2].value = u_bitcast_f2u(1.5f);
7106
7107 alu.dst.sel = ctx->temp_reg;
7108 alu.dst.chan = 1;
7109 alu.dst.write = 1;
7110
7111 alu.last = 1;
7112 r = r600_bytecode_add_alu(ctx->bc, &alu);
7113 if (r)
7114 return r;
7115 /* write initial compare value into Z component
7116 - W src 0 for shadow cube
7117 - X src 1 for shadow cube array */
7118 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7119 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7120 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7121 alu.op = ALU_OP1_MOV;
7122 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
7123 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7124 else
7125 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7126 alu.dst.sel = ctx->temp_reg;
7127 alu.dst.chan = 2;
7128 alu.dst.write = 1;
7129 alu.last = 1;
7130 r = r600_bytecode_add_alu(ctx->bc, &alu);
7131 if (r)
7132 return r;
7133 }
7134
7135 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7136 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7137 if (ctx->bc->chip_class >= EVERGREEN) {
7138 int mytmp = r600_get_temp(ctx);
7139 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7140 alu.op = ALU_OP1_MOV;
7141 alu.src[0].sel = ctx->temp_reg;
7142 alu.src[0].chan = 3;
7143 alu.dst.sel = mytmp;
7144 alu.dst.chan = 0;
7145 alu.dst.write = 1;
7146 alu.last = 1;
7147 r = r600_bytecode_add_alu(ctx->bc, &alu);
7148 if (r)
7149 return r;
7150
7151 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */
7152 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7153 alu.op = ALU_OP3_MULADD;
7154 alu.is_op3 = 1;
7155 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7156 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7157 alu.src[1].chan = 0;
7158 alu.src[1].value = u_bitcast_f2u(8.0f);
7159 alu.src[2].sel = mytmp;
7160 alu.src[2].chan = 0;
7161 alu.dst.sel = ctx->temp_reg;
7162 alu.dst.chan = 3;
7163 alu.dst.write = 1;
7164 alu.last = 1;
7165 r = r600_bytecode_add_alu(ctx->bc, &alu);
7166 if (r)
7167 return r;
7168 } else if (ctx->bc->chip_class < EVERGREEN) {
7169 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7170 tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7171 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7172 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7173 tex.src_gpr = r600_get_temp(ctx);
7174 tex.src_sel_x = 0;
7175 tex.src_sel_y = 0;
7176 tex.src_sel_z = 0;
7177 tex.src_sel_w = 0;
7178 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7179 tex.coord_type_x = 1;
7180 tex.coord_type_y = 1;
7181 tex.coord_type_z = 1;
7182 tex.coord_type_w = 1;
7183 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7184 alu.op = ALU_OP1_MOV;
7185 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7186 alu.dst.sel = tex.src_gpr;
7187 alu.dst.chan = 0;
7188 alu.last = 1;
7189 alu.dst.write = 1;
7190 r = r600_bytecode_add_alu(ctx->bc, &alu);
7191 if (r)
7192 return r;
7193
7194 r = r600_bytecode_add_tex(ctx->bc, &tex);
7195 if (r)
7196 return r;
7197 }
7198
7199 }
7200
7201 /* for cube forms of lod and bias we need to route things */
7202 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7203 inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7204 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7205 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7206 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7207 alu.op = ALU_OP1_MOV;
7208 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7209 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7210 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7211 else
7212 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7213 alu.dst.sel = ctx->temp_reg;
7214 alu.dst.chan = 2;
7215 alu.last = 1;
7216 alu.dst.write = 1;
7217 r = r600_bytecode_add_alu(ctx->bc, &alu);
7218 if (r)
7219 return r;
7220 }
7221
7222 src_loaded = TRUE;
7223 src_gpr = ctx->temp_reg;
7224 }
7225
7226 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7227 int temp_h = 0, temp_v = 0;
7228 int start_val = 0;
7229
7230 /* if we've already loaded the src (i.e. CUBE don't reload it). */
7231 if (src_loaded == TRUE)
7232 start_val = 1;
7233 else
7234 src_loaded = TRUE;
7235 for (i = start_val; i < 3; i++) {
7236 int treg = r600_get_temp(ctx);
7237
7238 if (i == 0)
7239 src_gpr = treg;
7240 else if (i == 1)
7241 temp_h = treg;
7242 else
7243 temp_v = treg;
7244
7245 for (j = 0; j < 4; j++) {
7246 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7247 alu.op = ALU_OP1_MOV;
7248 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7249 alu.dst.sel = treg;
7250 alu.dst.chan = j;
7251 if (j == 3)
7252 alu.last = 1;
7253 alu.dst.write = 1;
7254 r = r600_bytecode_add_alu(ctx->bc, &alu);
7255 if (r)
7256 return r;
7257 }
7258 }
7259 for (i = 1; i < 3; i++) {
7260 /* set gradients h/v */
7261 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7262 tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7263 FETCH_OP_SET_GRADIENTS_V;
7264 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7265 tex.sampler_index_mode = sampler_index_mode;
7266 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7267 tex.resource_index_mode = sampler_index_mode;
7268
7269 tex.src_gpr = (i == 1) ? temp_h : temp_v;
7270 tex.src_sel_x = 0;
7271 tex.src_sel_y = 1;
7272 tex.src_sel_z = 2;
7273 tex.src_sel_w = 3;
7274
7275 tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7276 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7277 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7278 tex.coord_type_x = 1;
7279 tex.coord_type_y = 1;
7280 tex.coord_type_z = 1;
7281 tex.coord_type_w = 1;
7282 }
7283 r = r600_bytecode_add_tex(ctx->bc, &tex);
7284 if (r)
7285 return r;
7286 }
7287 }
7288
7289 if (src_requires_loading && !src_loaded) {
7290 for (i = 0; i < 4; i++) {
7291 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7292 alu.op = ALU_OP1_MOV;
7293 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7294 alu.dst.sel = ctx->temp_reg;
7295 alu.dst.chan = i;
7296 if (i == 3)
7297 alu.last = 1;
7298 alu.dst.write = 1;
7299 r = r600_bytecode_add_alu(ctx->bc, &alu);
7300 if (r)
7301 return r;
7302 }
7303 src_loaded = TRUE;
7304 src_gpr = ctx->temp_reg;
7305 }
7306
7307 /* get offset values */
7308 if (inst->Texture.NumOffsets) {
7309 assert(inst->Texture.NumOffsets == 1);
7310
7311 /* The texture offset feature doesn't work with the TXF instruction
7312 * and must be emulated by adding the offset to the texture coordinates. */
7313 if (txf_add_offsets) {
7314 const struct tgsi_texture_offset *off = inst->TexOffsets;
7315
7316 switch (inst->Texture.Texture) {
7317 case TGSI_TEXTURE_3D:
7318 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7319 alu.op = ALU_OP2_ADD_INT;
7320 alu.src[0].sel = src_gpr;
7321 alu.src[0].chan = 2;
7322 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7323 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
7324 alu.dst.sel = src_gpr;
7325 alu.dst.chan = 2;
7326 alu.dst.write = 1;
7327 alu.last = 1;
7328 r = r600_bytecode_add_alu(ctx->bc, &alu);
7329 if (r)
7330 return r;
7331 /* fall through */
7332
7333 case TGSI_TEXTURE_2D:
7334 case TGSI_TEXTURE_SHADOW2D:
7335 case TGSI_TEXTURE_RECT:
7336 case TGSI_TEXTURE_SHADOWRECT:
7337 case TGSI_TEXTURE_2D_ARRAY:
7338 case TGSI_TEXTURE_SHADOW2D_ARRAY:
7339 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7340 alu.op = ALU_OP2_ADD_INT;
7341 alu.src[0].sel = src_gpr;
7342 alu.src[0].chan = 1;
7343 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7344 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
7345 alu.dst.sel = src_gpr;
7346 alu.dst.chan = 1;
7347 alu.dst.write = 1;
7348 alu.last = 1;
7349 r = r600_bytecode_add_alu(ctx->bc, &alu);
7350 if (r)
7351 return r;
7352 /* fall through */
7353
7354 case TGSI_TEXTURE_1D:
7355 case TGSI_TEXTURE_SHADOW1D:
7356 case TGSI_TEXTURE_1D_ARRAY:
7357 case TGSI_TEXTURE_SHADOW1D_ARRAY:
7358 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7359 alu.op = ALU_OP2_ADD_INT;
7360 alu.src[0].sel = src_gpr;
7361 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7362 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
7363 alu.dst.sel = src_gpr;
7364 alu.dst.write = 1;
7365 alu.last = 1;
7366 r = r600_bytecode_add_alu(ctx->bc, &alu);
7367 if (r)
7368 return r;
7369 break;
7370 /* texture offsets do not apply to other texture targets */
7371 }
7372 } else {
7373 switch (inst->Texture.Texture) {
7374 case TGSI_TEXTURE_3D:
7375 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
7376 /* fallthrough */
7377 case TGSI_TEXTURE_2D:
7378 case TGSI_TEXTURE_SHADOW2D:
7379 case TGSI_TEXTURE_RECT:
7380 case TGSI_TEXTURE_SHADOWRECT:
7381 case TGSI_TEXTURE_2D_ARRAY:
7382 case TGSI_TEXTURE_SHADOW2D_ARRAY:
7383 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
7384 /* fallthrough */
7385 case TGSI_TEXTURE_1D:
7386 case TGSI_TEXTURE_SHADOW1D:
7387 case TGSI_TEXTURE_1D_ARRAY:
7388 case TGSI_TEXTURE_SHADOW1D_ARRAY:
7389 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
7390 }
7391 }
7392 }
7393
7394 /* Obtain the sample index for reading a compressed MSAA color texture.
7395 * To read the FMASK, we use the ldfptr instruction, which tells us
7396 * where the samples are stored.
7397 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
7398 * which is the identity mapping. Each nibble says which physical sample
7399 * should be fetched to get that sample.
7400 *
7401 * Assume src.z contains the sample index. It should be modified like this:
7402 * src.z = (ldfptr() >> (src.z * 4)) & 0xF;
7403 * Then fetch the texel with src.
7404 */
7405 if (read_compressed_msaa) {
7406 unsigned sample_chan = 3;
7407 unsigned temp = r600_get_temp(ctx);
7408 assert(src_loaded);
7409
7410 /* temp.w = ldfptr() */
7411 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7412 tex.op = FETCH_OP_LD;
7413 tex.inst_mod = 1; /* to indicate this is ldfptr */
7414 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7415 tex.sampler_index_mode = sampler_index_mode;
7416 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7417 tex.resource_index_mode = sampler_index_mode;
7418 tex.src_gpr = src_gpr;
7419 tex.dst_gpr = temp;
7420 tex.dst_sel_x = 7; /* mask out these components */
7421 tex.dst_sel_y = 7;
7422 tex.dst_sel_z = 7;
7423 tex.dst_sel_w = 0; /* store X */
7424 tex.src_sel_x = 0;
7425 tex.src_sel_y = 1;
7426 tex.src_sel_z = 2;
7427 tex.src_sel_w = 3;
7428 tex.offset_x = offset_x;
7429 tex.offset_y = offset_y;
7430 tex.offset_z = offset_z;
7431 r = r600_bytecode_add_tex(ctx->bc, &tex);
7432 if (r)
7433 return r;
7434
7435 /* temp.x = sample_index*4 */
7436 if (ctx->bc->chip_class == CAYMAN) {
7437 for (i = 0 ; i < 4; i++) {
7438 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7439 alu.op = ALU_OP2_MULLO_INT;
7440 alu.src[0].sel = src_gpr;
7441 alu.src[0].chan = sample_chan;
7442 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7443 alu.src[1].value = 4;
7444 alu.dst.sel = temp;
7445 alu.dst.chan = i;
7446 alu.dst.write = i == 0;
7447 if (i == 3)
7448 alu.last = 1;
7449 r = r600_bytecode_add_alu(ctx->bc, &alu);
7450 if (r)
7451 return r;
7452 }
7453 } else {
7454 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7455 alu.op = ALU_OP2_MULLO_INT;
7456 alu.src[0].sel = src_gpr;
7457 alu.src[0].chan = sample_chan;
7458 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7459 alu.src[1].value = 4;
7460 alu.dst.sel = temp;
7461 alu.dst.chan = 0;
7462 alu.dst.write = 1;
7463 alu.last = 1;
7464 r = r600_bytecode_add_alu(ctx->bc, &alu);
7465 if (r)
7466 return r;
7467 }
7468
7469 /* sample_index = temp.w >> temp.x */
7470 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7471 alu.op = ALU_OP2_LSHR_INT;
7472 alu.src[0].sel = temp;
7473 alu.src[0].chan = 3;
7474 alu.src[1].sel = temp;
7475 alu.src[1].chan = 0;
7476 alu.dst.sel = src_gpr;
7477 alu.dst.chan = sample_chan;
7478 alu.dst.write = 1;
7479 alu.last = 1;
7480 r = r600_bytecode_add_alu(ctx->bc, &alu);
7481 if (r)
7482 return r;
7483
7484 /* sample_index & 0xF */
7485 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7486 alu.op = ALU_OP2_AND_INT;
7487 alu.src[0].sel = src_gpr;
7488 alu.src[0].chan = sample_chan;
7489 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7490 alu.src[1].value = 0xF;
7491 alu.dst.sel = src_gpr;
7492 alu.dst.chan = sample_chan;
7493 alu.dst.write = 1;
7494 alu.last = 1;
7495 r = r600_bytecode_add_alu(ctx->bc, &alu);
7496 if (r)
7497 return r;
7498 #if 0
7499 /* visualize the FMASK */
7500 for (i = 0; i < 4; i++) {
7501 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7502 alu.op = ALU_OP1_INT_TO_FLT;
7503 alu.src[0].sel = src_gpr;
7504 alu.src[0].chan = sample_chan;
7505 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7506 alu.dst.chan = i;
7507 alu.dst.write = 1;
7508 alu.last = 1;
7509 r = r600_bytecode_add_alu(ctx->bc, &alu);
7510 if (r)
7511 return r;
7512 }
7513 return 0;
7514 #endif
7515 }
7516
7517 /* does this shader want a num layers from TXQ for a cube array? */
7518 if (has_txq_cube_array_z) {
7519 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7520
7521 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7522 alu.op = ALU_OP1_MOV;
7523
7524 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
7525 if (ctx->bc->chip_class >= EVERGREEN) {
7526 /* channel 1 or 3 of each word */
7527 alu.src[0].sel += (id / 2);
7528 alu.src[0].chan = ((id % 2) * 2) + 1;
7529 } else {
7530 /* r600 we have them at channel 2 of the second dword */
7531 alu.src[0].sel += (id * 2) + 1;
7532 alu.src[0].chan = 2;
7533 }
7534 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7535 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
7536 alu.last = 1;
7537 r = r600_bytecode_add_alu(ctx->bc, &alu);
7538 if (r)
7539 return r;
7540 /* disable writemask from texture instruction */
7541 inst->Dst[0].Register.WriteMask &= ~4;
7542 }
7543
7544 opcode = ctx->inst_info->op;
7545 if (opcode == FETCH_OP_GATHER4 &&
7546 inst->TexOffsets[0].File != TGSI_FILE_NULL &&
7547 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
7548 opcode = FETCH_OP_GATHER4_O;
7549
7550 /* GATHER4_O/GATHER4_C_O use offset values loaded by
7551 SET_TEXTURE_OFFSETS instruction. The immediate offset values
7552 encoded in the instruction are ignored. */
7553 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7554 tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
7555 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7556 tex.sampler_index_mode = sampler_index_mode;
7557 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7558 tex.resource_index_mode = sampler_index_mode;
7559
7560 tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
7561 tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
7562 tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
7563 tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
7564 tex.src_sel_w = 4;
7565
7566 tex.dst_sel_x = 7;
7567 tex.dst_sel_y = 7;
7568 tex.dst_sel_z = 7;
7569 tex.dst_sel_w = 7;
7570
7571 r = r600_bytecode_add_tex(ctx->bc, &tex);
7572 if (r)
7573 return r;
7574 }
7575
7576 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7577 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7578 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7579 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7580 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
7581 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7582 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7583 switch (opcode) {
7584 case FETCH_OP_SAMPLE:
7585 opcode = FETCH_OP_SAMPLE_C;
7586 break;
7587 case FETCH_OP_SAMPLE_L:
7588 opcode = FETCH_OP_SAMPLE_C_L;
7589 break;
7590 case FETCH_OP_SAMPLE_LB:
7591 opcode = FETCH_OP_SAMPLE_C_LB;
7592 break;
7593 case FETCH_OP_SAMPLE_G:
7594 opcode = FETCH_OP_SAMPLE_C_G;
7595 break;
7596 /* Texture gather variants */
7597 case FETCH_OP_GATHER4:
7598 opcode = FETCH_OP_GATHER4_C;
7599 break;
7600 case FETCH_OP_GATHER4_O:
7601 opcode = FETCH_OP_GATHER4_C_O;
7602 break;
7603 }
7604 }
7605
7606 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7607 tex.op = opcode;
7608
7609 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7610 tex.sampler_index_mode = sampler_index_mode;
7611 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7612 tex.resource_index_mode = sampler_index_mode;
7613 tex.src_gpr = src_gpr;
7614 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7615
7616 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
7617 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
7618 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
7619 }
7620
7621 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7622 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
7623 tex.inst_mod = texture_component_select;
7624
7625 if (ctx->bc->chip_class == CAYMAN) {
7626 /* GATHER4 result order is different from TGSI TG4 */
7627 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
7628 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
7629 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
7630 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7631 } else {
7632 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7633 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7634 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7635 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7636 }
7637 }
7638 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
7639 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7640 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7641 tex.dst_sel_z = 7;
7642 tex.dst_sel_w = 7;
7643 }
7644 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7645 tex.dst_sel_x = 3;
7646 tex.dst_sel_y = 7;
7647 tex.dst_sel_z = 7;
7648 tex.dst_sel_w = 7;
7649 }
7650 else {
7651 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7652 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7653 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7654 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7655 }
7656
7657
7658 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7659 tex.src_sel_x = 4;
7660 tex.src_sel_y = 4;
7661 tex.src_sel_z = 4;
7662 tex.src_sel_w = 4;
7663 } else if (src_loaded) {
7664 tex.src_sel_x = 0;
7665 tex.src_sel_y = 1;
7666 tex.src_sel_z = 2;
7667 tex.src_sel_w = 3;
7668 } else {
7669 tex.src_sel_x = ctx->src[0].swizzle[0];
7670 tex.src_sel_y = ctx->src[0].swizzle[1];
7671 tex.src_sel_z = ctx->src[0].swizzle[2];
7672 tex.src_sel_w = ctx->src[0].swizzle[3];
7673 tex.src_rel = ctx->src[0].rel;
7674 }
7675
7676 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7677 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7678 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7679 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7680 tex.src_sel_x = 1;
7681 tex.src_sel_y = 0;
7682 tex.src_sel_z = 3;
7683 tex.src_sel_w = 2; /* route Z compare or Lod value into W */
7684 }
7685
7686 if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
7687 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
7688 tex.coord_type_x = 1;
7689 tex.coord_type_y = 1;
7690 }
7691 tex.coord_type_z = 1;
7692 tex.coord_type_w = 1;
7693
7694 tex.offset_x = offset_x;
7695 tex.offset_y = offset_y;
7696 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
7697 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7698 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
7699 tex.offset_z = 0;
7700 }
7701 else {
7702 tex.offset_z = offset_z;
7703 }
7704
7705 /* Put the depth for comparison in W.
7706 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
7707 * Some instructions expect the depth in Z. */
7708 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7709 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7710 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7711 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
7712 opcode != FETCH_OP_SAMPLE_C_L &&
7713 opcode != FETCH_OP_SAMPLE_C_LB) {
7714 tex.src_sel_w = tex.src_sel_z;
7715 }
7716
7717 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
7718 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
7719 if (opcode == FETCH_OP_SAMPLE_C_L ||
7720 opcode == FETCH_OP_SAMPLE_C_LB) {
7721 /* the array index is read from Y */
7722 tex.coord_type_y = 0;
7723 } else {
7724 /* the array index is read from Z */
7725 tex.coord_type_z = 0;
7726 tex.src_sel_z = tex.src_sel_y;
7727 }
7728 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7729 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7730 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7731 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7732 (ctx->bc->chip_class >= EVERGREEN)))
7733 /* the array index is read from Z */
7734 tex.coord_type_z = 0;
7735
7736 /* mask unused source components */
7737 if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
7738 switch (inst->Texture.Texture) {
7739 case TGSI_TEXTURE_2D:
7740 case TGSI_TEXTURE_RECT:
7741 tex.src_sel_z = 7;
7742 tex.src_sel_w = 7;
7743 break;
7744 case TGSI_TEXTURE_1D_ARRAY:
7745 tex.src_sel_y = 7;
7746 tex.src_sel_w = 7;
7747 break;
7748 case TGSI_TEXTURE_1D:
7749 tex.src_sel_y = 7;
7750 tex.src_sel_z = 7;
7751 tex.src_sel_w = 7;
7752 break;
7753 }
7754 }
7755
7756 r = r600_bytecode_add_tex(ctx->bc, &tex);
7757 if (r)
7758 return r;
7759
7760 /* add shadow ambient support - gallium doesn't do it yet */
7761 return 0;
7762 }
7763
7764 static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
7765 struct tgsi_full_src_register *src)
7766 {
7767 unsigned i;
7768
7769 if (src->Register.Indirect) {
7770 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
7771 if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
7772 return ctx->shader->atomics[i].hw_idx;
7773 }
7774 } else {
7775 uint32_t index = src->Register.Index;
7776 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
7777 if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
7778 continue;
7779 if (index > ctx->shader->atomics[i].end)
7780 continue;
7781 if (index < ctx->shader->atomics[i].start)
7782 continue;
7783 uint32_t offset = (index - ctx->shader->atomics[i].start);
7784 return ctx->shader->atomics[i].hw_idx + offset;
7785 }
7786 }
7787 assert(0);
7788 return -1;
7789 }
7790
7791
static int tgsi_load_gds(struct r600_shader_ctx *ctx)
{
	/* Emit a GDS (global data share) read for a LOAD from a HW atomic
	 * counter. The UAV slot is resolved from the TGSI atomic declaration;
	 * the fetched value lands in the X channel of the destination GPR. */
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int r;
	struct r600_bytecode_gds gds;
	int uav_id = 0;
	int uav_index_mode = 0;

	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);

	/* indirect addressing selects the UAV through an index register */
	if (inst->Src[0].Register.Indirect)
		uav_index_mode = 2;

	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
	gds.op = FETCH_OP_GDS_READ_RET;
	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	gds.uav_id = uav_id;
	gds.uav_index_mode = uav_index_mode;
	gds.src_gpr = ctx->temp_reg;
	/* src_sel 4: no GPR component is consumed (same convention as the
	 * other GDS emits in this file) */
	gds.src_sel_x = 4;
	gds.src_sel_y = 4;
	gds.src_sel_z = 4;
	/* only X receives the returned value; dst_sel 7 masks a channel */
	gds.dst_sel_x = 0;
	gds.dst_sel_y = 7;
	gds.dst_sel_z = 7;
	gds.dst_sel_w = 7;
	gds.src_gpr2 = ctx->temp_reg;
	gds.alloc_consume = 1;
	r = r600_bytecode_add_gds(ctx->bc, &gds);
	if (r)
		return r;

	/* NOTE(review): vpm presumably restricts the op to valid pixels —
	 * confirm against the CF word definition */
	ctx->bc->cf_last->vpm = 1;
	return 0;
}
7827
7828 /* this fixes up 1D arrays properly */
7829 static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
7830 {
7831 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7832 int r, i;
7833 struct r600_bytecode_alu alu;
7834 int temp_reg = r600_get_temp(ctx);
7835
7836 for (i = 0; i < 4; i++) {
7837 bool def_val = true, write_zero = false;
7838 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7839 alu.op = ALU_OP1_MOV;
7840 alu.dst.sel = temp_reg;
7841 alu.dst.chan = i;
7842
7843 switch (inst->Memory.Texture) {
7844 case TGSI_TEXTURE_BUFFER:
7845 case TGSI_TEXTURE_1D:
7846 if (i == 1 || i == 2 || i == 3) {
7847 write_zero = true;
7848 }
7849 break;
7850 case TGSI_TEXTURE_1D_ARRAY:
7851 if (i == 1 || i == 3)
7852 write_zero = true;
7853 else if (i == 2) {
7854 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
7855 def_val = false;
7856 }
7857 break;
7858 case TGSI_TEXTURE_2D:
7859 if (i == 2 || i == 3)
7860 write_zero = true;
7861 break;
7862 default:
7863 if (i == 3)
7864 write_zero = true;
7865 break;
7866 }
7867
7868 if (write_zero) {
7869 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
7870 alu.src[0].value = 0;
7871 } else if (def_val) {
7872 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
7873 }
7874
7875 if (i == 3)
7876 alu.last = 1;
7877 alu.dst.write = 1;
7878 r = r600_bytecode_add_alu(ctx->bc, &alu);
7879 if (r)
7880 return r;
7881 }
7882 *idx_gpr = temp_reg;
7883 return 0;
7884 }
7885
static int tgsi_load_rat(struct r600_shader_ctx *ctx)
{
	/* Emit a typed image LOAD through a RAT (random access target):
	 * a MEM_RAT NOP_RTN read request, a WAIT_ACK, then a VFETCH from the
	 * immediate return buffer to move the data into the destination GPR. */
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode;
	unsigned immed_base;

	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
	/* normalize the image coordinate into a temp GPR (zero-fill unused
	 * channels, fix up 1D array layouts) */
	r = load_index_src(ctx, 1, &idx_gpr);
	if (r)
		return r;

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	/* request the read; the result goes to the immediate return buffer */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = V_RAT_INST_NOP_RTN;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;

	/* wait for the RAT read to complete before fetching the result */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
	cf = ctx->bc->cf_last;
	cf->barrier = 1;

	/* fetch the returned data, swizzled/decoded per the image format */
	desc = util_format_description(inst->Memory.Format);
	r600_vertex_data_type(inst->Memory.Format,
			      &format, &num_format, &format_comp, &endian);
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = desc->swizzle[0];
	vtx.dst_sel_y = desc->swizzle[1];
	vtx.dst_sel_z = desc->swizzle[2];
	vtx.dst_sel_w = desc->swizzle[3];
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 3;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	return 0;
}
7958
7959 static int tgsi_load(struct r600_shader_ctx *ctx)
7960 {
7961 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7962 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
7963 return tgsi_load_rat(ctx);
7964 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
7965 return tgsi_load_gds(ctx);
7966 return 0;
7967 }
7968
static int tgsi_store_rat(struct r600_shader_ctx *ctx)
{
	/* Emit a typed image STORE through a RAT: stage the value in a GPR
	 * if needed, then issue a MEM_RAT STORE_TYPED with the coordinate in
	 * the index GPR and the value in the output GPR. */
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_cf *cf;
	bool src_requires_loading = false;
	int val_gpr, idx_gpr;
	int r, i;
	unsigned rat_index_mode;

	rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	/* normalize the store coordinate into a temp GPR */
	r = load_index_src(ctx, 0, &idx_gpr);
	if (r)
		return r;

	/* the CF output GPR must be a real register, so anything that is not
	 * already a temporary gets copied into ctx->temp_reg first */
	if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
		src_requires_loading = true;

	if (src_requires_loading) {
		struct r600_bytecode_alu alu;
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		val_gpr = ctx->temp_reg;
	} else
		val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
	cf->rat.inst = V_RAT_INST_STORE_TYPED;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
	cf->output.gpr = val_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->output.elem_size = 0;
	return 0;
}
8025
static int tgsi_store(struct r600_shader_ctx *ctx)
{
	/* Only RAT (image) stores are handled by this driver path. */
	return tgsi_store_rat(ctx);
}
8030
static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
{
	/* Emit an image atomic through a RAT: stage the operand value(s) in
	 * the thread-id GPR, issue the MEM_RAT atomic, WAIT_ACK, then VFETCH
	 * the returned (pre-op) value into the destination GPR. */
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* have to work out the offset into the RAT immediate return buffer */
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_cf *cf;
	int r;
	int idx_gpr;
	unsigned format, num_format, format_comp, endian;
	const struct util_format_description *desc;
	unsigned rat_index_mode;
	unsigned immed_base;

	immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;

	assert (inst->Src[0].Register.File == TGSI_FILE_IMAGE);
	rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE

	/* normalize the image coordinate into a temp GPR */
	r = load_index_src(ctx, 1, &idx_gpr);
	if (r)
		return r;

	if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
		/* compare-and-swap takes two operands: Src[3] goes to .x and
		 * Src[2] to .w of the staging GPR.
		 * NOTE(review): presumably .x carries the exchange value and
		 * .w the comparand — confirm against the RAT ISA docs. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		/* all other atomics take a single operand in .x */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->thread_id_gpr;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (rat_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
	cf = ctx->bc->cf_last;

	/* the specific atomic op was preselected into inst_info->op */
	cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
	cf->rat.inst = ctx->inst_info->op;
	cf->rat.index_mode = rat_index_mode;
	cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
	cf->output.gpr = ctx->thread_id_gpr;
	cf->output.index_gpr = idx_gpr;
	cf->output.comp_mask = 0xf;
	cf->output.burst_count = 1;
	cf->vpm = 1;
	cf->barrier = 1;
	cf->mark = 1;
	cf->output.elem_size = 0;
	/* wait for the atomic's return value before reading it back */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
	cf = ctx->bc->cf_last;
	cf->barrier = 1;
	cf->cf_addr = 1;

	/* fetch the returned value; only the X component is kept */
	desc = util_format_description(inst->Memory.Format);
	r600_vertex_data_type(inst->Memory.Format,
			      &format, &num_format, &format_comp, &endian);
	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
	vtx.buffer_index_mode = rat_index_mode;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ctx->thread_id_gpr;
	vtx.src_sel_x = 1;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	vtx.dst_sel_x = desc->swizzle[0];
	vtx.dst_sel_y = 7;
	vtx.dst_sel_z = 7;
	vtx.dst_sel_w = 7;
	vtx.use_const_fields = 0;
	vtx.srf_mode_all = 1;
	vtx.data_format = format;
	vtx.num_format_all = num_format;
	vtx.format_comp_all = format_comp;
	vtx.endian = endian;
	vtx.offset = 0;
	vtx.mega_fetch_count = 0xf;
	r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
	if (r)
		return r;
	cf = ctx->bc->cf_last;
	cf->vpm = 1;
	cf->barrier = 1;
	return 0;
}
8142
8143 static int get_gds_op(int opcode)
8144 {
8145 switch (opcode) {
8146 case TGSI_OPCODE_ATOMUADD:
8147 return FETCH_OP_GDS_ADD_RET;
8148 case TGSI_OPCODE_ATOMAND:
8149 return FETCH_OP_GDS_AND_RET;
8150 case TGSI_OPCODE_ATOMOR:
8151 return FETCH_OP_GDS_OR_RET;
8152 case TGSI_OPCODE_ATOMXOR:
8153 return FETCH_OP_GDS_XOR_RET;
8154 case TGSI_OPCODE_ATOMUMIN:
8155 return FETCH_OP_GDS_MIN_UINT_RET;
8156 case TGSI_OPCODE_ATOMUMAX:
8157 return FETCH_OP_GDS_MAX_UINT_RET;
8158 case TGSI_OPCODE_ATOMXCHG:
8159 return FETCH_OP_GDS_XCHG_RET;
8160 case TGSI_OPCODE_ATOMCAS:
8161 return FETCH_OP_GDS_CMP_XCHG_RET;
8162 default:
8163 return -1;
8164 }
8165 }
8166
static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
{
	/* Emit an atomic on a HW atomic counter through GDS: stage the operand
	 * in temp_reg.x, then issue the GDS op with the returned (pre-op)
	 * value landing in the destination GPR's X channel. */
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_gds gds;
	struct r600_bytecode_alu alu;
	int gds_op = get_gds_op(inst->Instruction.Opcode);
	int r;
	int uav_id = 0;
	int uav_index_mode = 0;

	if (gds_op == -1) {
		fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
		return -1;
	}

	uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);

	/* indirect addressing selects the UAV through an index register */
	if (inst->Src[0].Register.Indirect)
		uav_index_mode = 2;

	if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
		/* GDS add of a negative immediate is rewritten as a subtract
		 * of its magnitude.
		 * NOTE(review): abs_value is loaded unconditionally, so a
		 * negative immediate fed to a non-ADD op (AND/OR/XOR/...)
		 * would lose its sign bit pattern — verify whether that case
		 * can occur / is handled upstream. */
		int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
		int abs_value = abs(value);
		if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
			gds_op = FETCH_OP_GDS_SUB_RET;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = abs_value;
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	} else {
		/* non-immediate operand: copy it into the staging register */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	memset(&gds, 0, sizeof(struct r600_bytecode_gds));
	gds.op = gds_op;
	gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	gds.uav_id = uav_id;
	gds.uav_index_mode = uav_index_mode;
	gds.src_gpr = ctx->temp_reg;
	gds.src_gpr2 = ctx->temp_reg;
	/* the staged operand is read from Y here (sel 0 on Y); X and Z are
	 * unused (sel 4) — follows the other GDS emits in this file */
	gds.src_sel_x = 4;
	gds.src_sel_y = 0;
	gds.src_sel_z = 4;
	/* only X receives the returned value; dst_sel 7 masks a channel */
	gds.dst_sel_x = 0;
	gds.dst_sel_y = 7;
	gds.dst_sel_z = 7;
	gds.dst_sel_w = 7;
	gds.alloc_consume = 1;
	r = r600_bytecode_add_gds(ctx->bc, &gds);
	if (r)
		return r;
	ctx->bc->cf_last->vpm = 1;
	return 0;
}
8237
8238 static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
8239 {
8240 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8241 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
8242 return tgsi_atomic_op_rat(ctx);
8243 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
8244 return tgsi_atomic_op_gds(ctx);
8245 return 0;
8246 }
8247
static int tgsi_resq(struct r600_shader_ctx *ctx)
{
	/* Emit a RESQ (image size query) as a texture resource query, with
	 * special cases for buffer images and the Z component (num layers)
	 * of cube arrays, which comes from the buffer-info constants. */
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned sampler_index_mode;
	struct r600_bytecode_tex tex;
	int r;
	boolean has_txq_cube_array_z = false;

	if (inst->Memory.Texture == TGSI_TEXTURE_BUFFER) {
		/* buffer sizes come from a constant, not a TXQ */
		ctx->shader->uses_tex_buffers = true;
		return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset);
	}

	if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
	    inst->Dst[0].Register.WriteMask & 4) {
		ctx->shader->has_txq_cube_array_z_comp = true;
		has_txq_cube_array_z = true;
	}

	sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
	if (sampler_index_mode)
		egcm_load_index_reg(ctx->bc, 1, false);


	/* does this shader want a num layers from TXQ for a cube array? */
	if (has_txq_cube_array_z) {
		int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
		/* channel 1 or 3 of each word */
		alu.src[0].sel += (id / 2);
		alu.src[0].chan = ((id % 2) * 2) + 1;
		alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		/* disable writemask from texture instruction */
		inst->Dst[0].Register.WriteMask &= ~4;
	}
	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
	tex.op = ctx->inst_info->op;
	tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
	tex.sampler_index_mode = sampler_index_mode;
	tex.resource_id = tex.sampler_id;
	tex.resource_index_mode = sampler_index_mode;
	/* src_sel 4: the query consumes no coordinate components */
	tex.src_sel_x = 4;
	tex.src_sel_y = 4;
	tex.src_sel_z = 4;
	tex.src_sel_w = 4;
	/* write only the channels the writemask requests (7 masks a channel) */
	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	r = r600_bytecode_add_tex(ctx->bc, &tex);
	if (r)
		return r;

	return 0;
}
8314
static int tgsi_lrp(struct r600_shader_ctx *ctx)
{
	/* Expand TGSI LRP (dst = src0*src1 + (1-src0)*src2) into r600 ALU
	 * ops: tmp = 1 - src0; tmp *= src2; dst = src0*src1 + tmp.
	 * A src0 of exactly 0.5 collapses to a single halved ADD. */
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned i, temp_regs[2];
	int r;

	/* optimize if it's just an equal balance */
	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
		for (i = 0; i < lasti + 1; i++) {
			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
				continue;

			/* (src1 + src2) with output modifier 3 (halve) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
			alu.omod = 3;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.chan = i;
			if (i == lasti) {
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* 1 - src0 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		/* computed as 1 + (-src0) */
		r600_bytecode_src_toggle_neg(&alu.src[1]);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* (1 - src0) * src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* src0 * src1 + (1 - src0) * src2 */
	/* op3 encodings cannot carry an abs source modifier, so abs sources
	 * are staged through extra temps by tgsi_make_src_for_op3 */
	if (ctx->src[0].abs)
		temp_regs[0] = r600_get_temp(ctx);
	else
		temp_regs[0] = 0;
	if (ctx->src[1].abs)
		temp_regs[1] = r600_get_temp(ctx);
	else
		temp_regs[1] = 0;

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;
		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
		if (r)
			return r;
		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
		if (r)
			return r;
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
8426
static int tgsi_cmp(struct r600_shader_ctx *ctx)
{
	/* TGSI CMP: dst = (src0 < 0) ? src1 : src2, per component.
	 * Implemented as CNDGE (src >= 0 ? a : b) with the second and third
	 * sources swapped, so the comparison sense comes out right. */
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int temp_regs[3];
	unsigned op;

	if (ctx->src[0].abs && ctx->src[0].neg) {
		/* src0 is -|x|, which is >= 0 only when x == 0; drop the
		 * modifiers and use CNDE (x == 0 ? a : b) instead, which is
		 * equivalent and frees the modifier slots. */
		op = ALU_OP3_CNDE;
		ctx->src[0].abs = 0;
		ctx->src[0].neg = 0;
	} else {
		op = ALU_OP3_CNDGE;
	}

	/* op3 encodings cannot carry an abs source modifier, so abs sources
	 * are staged through temps by tgsi_make_src_for_op3 */
	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
		temp_regs[j] = 0;
		if (ctx->src[j].abs)
			temp_regs[j] = r600_get_temp(ctx);
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = op;
		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
		if (r)
			return r;
		/* note the swap: TGSI src2 feeds ALU slot 1 and vice versa */
		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
		if (r)
			return r;
		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
		if (r)
			return r;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
8477
8478 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
8479 {
8480 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8481 struct r600_bytecode_alu alu;
8482 int i, r;
8483 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8484
8485 for (i = 0; i < lasti + 1; i++) {
8486 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8487 continue;
8488
8489 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8490 alu.op = ALU_OP3_CNDE_INT;
8491 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8492 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
8493 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
8494 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8495 alu.dst.chan = i;
8496 alu.dst.write = 1;
8497 alu.is_op3 = 1;
8498 if (i == lasti)
8499 alu.last = 1;
8500 r = r600_bytecode_add_alu(ctx->bc, &alu);
8501 if (r)
8502 return r;
8503 }
8504 return 0;
8505 }
8506
/* Emit TGSI EXP:
 *   dst.x = 2^floor(src.x)
 *   dst.y = src.x - floor(src.x)          (FRACT)
 *   dst.z = 2^src.x                       (rough approximation)
 *   dst.w = 1.0
 * All results are staged in ctx->temp_reg and copied out by
 * tgsi_helper_copy() at the end.  On CAYMAN the transcendental EXP_IEEE
 * is replicated over three vector slots (t-slot only op, see the CAYMAN
 * notes at the top of this file). */
static int tgsi_exp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	unsigned i;

	/* result.x = 2^floor(src); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* Note: the alu struct from the FLOOR above is deliberately
		 * reused (no memset); only op/src/dst fields are updated. */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = i == 0;
				alu.last = i == 2;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.y = tmp - floor(tmp); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FRACT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
#if 0
		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (r)
			return r;
#endif
		alu.dst.write = 1;
		alu.dst.chan = 1;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = RoughApprox2ToX(tmp);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				/* only the z slot result is kept */
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				}

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;

			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0;*/
	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return tgsi_helper_copy(ctx, inst);
}
8635
/* Emit TGSI LOG:
 *   dst.x = floor(log2(|src.x|))
 *   dst.y = |src.x| / 2^floor(log2(|src.x|))   (the mantissa)
 *   dst.z = log2(|src.x|)
 *   dst.w = 1.0
 * Results are staged in ctx->temp_reg and copied to the real destination
 * by tgsi_helper_copy().  On CAYMAN the LOG/EXP/RECIP transcendentals are
 * t-slot only and therefore replicated across three vector slots, keeping
 * only the slot whose channel matches the wanted result. */
static int tgsi_log(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	unsigned i;

	/* result.x = floor(log2(|src|)); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 0)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* NOTE(review): alu is intentionally reused here without a
		 * memset; only the fields that differ from the LOG above are
		 * rewritten (op, src, dst.chan). */
		alu.op = ALU_OP1_FLOOR;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {

		/* temp.y = log2(|src.x|) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.y = floor(temp.y) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* temp.y = 2^temp.y */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 1;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.y = 1 / temp.y */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 1;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.y = |src.x| * temp.y */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP2_MUL;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		r600_bytecode_src_set_abs(&alu.src[0]);

		alu.src[1].sel = ctx->temp_reg;
		alu.src[1].chan = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = log2(|src|);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				if (i == 2)
					alu.dst.write = 1;
				alu.dst.chan = i;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0; */
	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return tgsi_helper_copy(ctx, inst);
}
8895
/* Evergreen+ ARL/ARR/UARL: convert the source to an integer (or just move
 * it for UARL) into the address register file selected by the destination
 * index, then mark the corresponding AR/index register as needing a
 * reload before its next use. */
static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);

	assert(inst->Dst[0].Register.Index < 3);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
		break;
	case TGSI_OPCODE_ARR:
		alu.op = ALU_OP1_FLT_TO_INT;
		break;
	case TGSI_OPCODE_UARL:
		alu.op = ALU_OP1_MOV;
		break;
	default:
		assert(0);
		return -1;
	}

	/* alu is reused across iterations; only src/dst/last change */
	for (i = 0; i <= lasti; ++i) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.last = i == lasti;
		alu.dst.sel = reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* force a reload of AR (index 0) or the extra index regs (1/2) */
	if (inst->Dst[0].Register.Index > 0)
		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
	else
		ctx->bc->ar_loaded = 0;

	return 0;
}
/* r600/r700 ARL/ARR/UARL: stage the integer address value in bc->ar_reg
 * (the AR itself is loaded lazily later) and mark AR as not loaded.
 * FLT_TO_INT is a trans-unit-only op on these chips, so each such ALU is
 * emitted with last=1 to keep it alone in its instruction group. */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* no FLT_TO_INT_FLOOR on r600: do FLOOR first, then FLT_TO_INT */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i))  {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		/* note: this conversion pass covers channels 0..lasti
		 * unconditionally, not just the writemask channels */
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	ctx->bc->ar_loaded = 0;
	return 0;
}
9018
9019 static int tgsi_opdst(struct r600_shader_ctx *ctx)
9020 {
9021 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9022 struct r600_bytecode_alu alu;
9023 int i, r = 0;
9024
9025 for (i = 0; i < 4; i++) {
9026 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9027
9028 alu.op = ALU_OP2_MUL;
9029 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9030
9031 if (i == 0 || i == 3) {
9032 alu.src[0].sel = V_SQ_ALU_SRC_1;
9033 } else {
9034 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9035 }
9036
9037 if (i == 0 || i == 2) {
9038 alu.src[1].sel = V_SQ_ALU_SRC_1;
9039 } else {
9040 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
9041 }
9042 if (i == 3)
9043 alu.last = 1;
9044 r = r600_bytecode_add_alu(ctx->bc, &alu);
9045 if (r)
9046 return r;
9047 }
9048 return 0;
9049 }
9050
9051 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
9052 {
9053 struct r600_bytecode_alu alu;
9054 int r;
9055
9056 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9057 alu.op = opcode;
9058 alu.execute_mask = 1;
9059 alu.update_pred = 1;
9060
9061 alu.dst.sel = ctx->temp_reg;
9062 alu.dst.write = 1;
9063 alu.dst.chan = 0;
9064
9065 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9066 alu.src[1].sel = V_SQ_ALU_SRC_0;
9067 alu.src[1].chan = 0;
9068
9069 alu.last = 1;
9070
9071 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
9072 if (r)
9073 return r;
9074 return 0;
9075 }
9076
/* Emit 'pops' stack pops.  When the last CF instruction is a plain ALU
 * clause, up to two pops can be folded into it by rewriting it to
 * ALU_POP_AFTER / ALU_POP2_AFTER; otherwise (or for more pops) a
 * standalone POP CF instruction is emitted. */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop = pops already folded into cf_last:
		 * 0 for plain ALU, 1 for ALU_POP_AFTER, 3 = "can't fold" */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		/* POP resumes at the next CF instruction */
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
9109
/* Recompute the worst-case hardware branch-stack usage after a push of
 * 'reason' and record it in stack->max_entries (used to program
 * STACK_SIZE).  Per-chip extra reservations follow the notes below. */
static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
		unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements;
	int entries;

	unsigned entry_size = stack->entry_size;

	/* loops and WQM pushes consume whole entries; VPM pushes one element */
	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	/* round elements up to whole stack entries */
	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
}
9174
9175 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
9176 {
9177 switch(reason) {
9178 case FC_PUSH_VPM:
9179 --ctx->bc->stack.push;
9180 assert(ctx->bc->stack.push >= 0);
9181 break;
9182 case FC_PUSH_WQM:
9183 --ctx->bc->stack.push_wqm;
9184 assert(ctx->bc->stack.push_wqm >= 0);
9185 break;
9186 case FC_LOOP:
9187 --ctx->bc->stack.loop;
9188 assert(ctx->bc->stack.loop >= 0);
9189 break;
9190 default:
9191 assert(0);
9192 break;
9193 }
9194 }
9195
/* Account for one stack-consuming control-flow construct being opened and
 * update the recorded worst-case stack depth. */
static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
{
	switch (reason) {
	case FC_PUSH_VPM:
		++ctx->bc->stack.push;
		break;
	case FC_PUSH_WQM:
		++ctx->bc->stack.push_wqm;
		/* NOTE(review): no break here, so a WQM push also increments
		 * stack.loop — callstack_pop() does not mirror this.
		 * Confirm whether this fallthrough is intentional. */
	case FC_LOOP:
		++ctx->bc->stack.loop;
		break;
	default:
		assert(0);
	}

	callstack_update_max_depth(ctx, reason);
}
9213
9214 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
9215 {
9216 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
9217
9218 sp->mid = realloc((void *)sp->mid,
9219 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
9220 sp->mid[sp->num_mid] = ctx->bc->cf_last;
9221 sp->num_mid++;
9222 }
9223
9224 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
9225 {
9226 assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
9227 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
9228 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
9229 ctx->bc->fc_sp++;
9230 }
9231
9232 static void fc_poplevel(struct r600_shader_ctx *ctx)
9233 {
9234 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
9235 free(sp->mid);
9236 sp->mid = NULL;
9237 sp->num_mid = 0;
9238 sp->start = NULL;
9239 sp->type = 0;
9240 ctx->bc->fc_sp--;
9241 }
9242
/* Dead code: sketch of subroutine-return support, never compiled.
 * NOTE(review): it would not compile as-is — the
 * r600_bytecode_add_cfinst(...)); calls below have an extra ')'. */
#if 0
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
9290
/* Open an IF block: emit the predicate compare (opcode vs 0), a JUMP whose
 * target is patched later by tgsi_else/tgsi_endif, and push the matching
 * control-flow and callstack state. */
static int emit_if(struct r600_shader_ctx *ctx, int opcode)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	fc_pushlevel(ctx, FC_IF);

	callstack_push(ctx, FC_PUSH_VPM);
	return 0;
}
9314
/* Float IF: branch taken when src != 0.0f. */
static int tgsi_if(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE);
}
9319
/* Integer IF: branch taken when src != 0. */
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
}
9324
/* ELSE: emit the CF instruction, register it as the frame's mid entry for
 * endif fixup, and patch the opening JUMP to land here. */
static int tgsi_else(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, ctx->bc->fc_sp - 1);
	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
	return 0;
}
9334
/* ENDIF: pop one stack level, then patch the pending JUMP (or the ELSE
 * recorded as mid) to the instruction after the IF block, and unwind the
 * control-flow/callstack frames. */
static int tgsi_endif(struct r600_shader_ctx *ctx)
{
	pops(ctx, 1);
	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
		R600_ERR("if/endif unbalanced in shader\n");
		return -1;
	}

	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
		/* no ELSE: the opening JUMP skips the whole block */
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
	} else {
		/* with ELSE: patch the ELSE's jump target instead */
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
	}
	fc_poplevel(ctx);

	callstack_pop(ctx, FC_PUSH_VPM);
	return 0;
}
9354
/* BGNLOOP: open a loop frame; addresses are fixed up in tgsi_endloop. */
static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
{
	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
	 * limited to 4096 iterations, like the other LOOP_* instructions. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);

	fc_pushlevel(ctx, FC_LOOP);

	/* check stack depth */
	callstack_push(ctx, FC_LOOP);
	return 0;
}
9367
/* ENDLOOP: emit LOOP_END and resolve all pending branch addresses for the
 * loop frame (start, end, and every recorded BREAK/CONTINUE). */
static int tgsi_endloop(struct r600_shader_ctx *ctx)
{
	int i;

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);

	if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
		R600_ERR("loop/endloop in shader code are not paired.\n");
		return -EINVAL;
	}

	/* fixup loop pointers - from r600isa
	   LOOP END points to CF after LOOP START,
	   LOOP START point to CF after LOOP END
	   BRK/CONT point to LOOP END CF
	*/
	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;

	ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;

	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
		ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
	}
	/* XXX add LOOPRET support */
	fc_poplevel(ctx);
	callstack_pop(ctx, FC_LOOP);
	return 0;
}
9396
9397 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
9398 {
9399 unsigned int fscp;
9400
9401 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
9402 {
9403 if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
9404 break;
9405 }
9406
9407 if (fscp == 0) {
9408 R600_ERR("Break not inside loop/endloop pair\n");
9409 return -EINVAL;
9410 }
9411
9412 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
9413
9414 fc_set_mid(ctx, fscp - 1);
9415
9416 return 0;
9417 }
9418
/* Geometry-shader EMIT/ENDPRIM: flush the pending vertex to the GS ring
 * (for EMIT_VERTEX) and emit the CUT/EMIT CF instruction for the stream
 * given by the literal first source. */
static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
	int r;

	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);

	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	if (!r) {
		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
		if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
			return emit_inc_ring_offset(ctx, stream, TRUE);
	}
	return r;
}
9436
/* UMAD: dst = src0 * src1 + src2 (unsigned).  The multiply is staged in
 * ctx->temp_reg, then src2 is added into the real destination.  On CAYMAN
 * MULLO_UINT is replicated across all four slots, keeping only the slot
 * whose channel matches (dst.write = (j == i)). */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP2_MULLO_UINT;
				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
				}
				alu.dst.chan = j;
				alu.dst.sel = ctx->temp_reg;
				/* keep only the replicated slot matching channel i */
				alu.dst.write = (j == i);
				if (j == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.dst.chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;

			alu.op = ALU_OP2_MULLO_UINT;
			for (j = 0; j < 2; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}

			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}


	/* dst = temp + src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
9508
/* PK2H: pack src.xy (two floats) into one 32-bit word of two half floats:
 * dst = f32_to_f16(src.y) << 16 | f32_to_f16(src.x), replicated to every
 * enabled destination channel. */
static int tgsi_pk2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.xy = f32_to_f16(src) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FLT32_TO_FLT16;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* second conversion reuses alu, changing only chan/src and setting
	 * last so both go into one instruction group */
	alu.dst.chan = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.x = temp.y * 0x10000 + temp.x */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD_UINT24;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 0x10000;
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = 0;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
9556
/* UP2H: unpack two half floats from src.x into floats; even destination
 * channels get the low half, odd channels the high half. */
static int tgsi_up2h(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* temp.x = src.x */
	/* note: no need to mask out the high bits */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.y = src.x >> 16 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.dst.chan = 1;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 16;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.wz = dst.xy = f16_to_f32(temp.xy) */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.op = ALU_OP1_FLT16_TO_FLT32;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i % 2;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
9607
/* BFE (bitfield extract) with the D3D/GL edge case: when the width
 * (src2) is >= 32 the result must be the unmodified src0 rather than what
 * the hardware BFE op produces.  The op3 BFE result is therefore emitted
 * first, then patched with CNDE_INT selecting between the BFE result and
 * raw src0 based on SETGE_INT(src2, 32).
 *
 * If the destination register aliases src0 or src2, the op3 result is
 * redirected to a fresh temp ('dst') so the fixup pass still reads the
 * original source values (this is the "bfe where src/dst are same" fix). */
static int tgsi_bfe(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int r, i;
	int dst = -1;

	/* use a temp destination when dst would clobber src0 or src2 */
	if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
	    (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
	     inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
		dst = r600_get_temp(ctx);

	r = tgsi_op3_dst(ctx, dst);
	if (r)
		return r;

	/* temp = (src2 >= 32) for each channel */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_INT;
		r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 32;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = temp ? src0 : bfe-result */
	for (i = 0; i < lasti + 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		/* read the bfe result from the temp when one was used */
		if (dst != -1)
			alu.src[1].sel = dst;
		else
			alu.src[1].sel = alu.dst.sel;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
9666
/* TGSI -> hardware instruction dispatch table for pre-Evergreen (R600/R700)
 * GPUs.  Indexed by TGSI_OPCODE_*; each entry pairs the native ALU/fetch/CF
 * opcode with the callback that emits it.  Bare integer indices ([21], [44],
 * ...) are retired/unassigned TGSI opcode numbers and map to tgsi_unsupported.
 * Compared with the Evergreen table below, this generation leaves many
 * features unsupported here: FMA, fine derivatives, bitfield ops (IBFE/UBFE/
 * BFI/BREV/POPC/LSB/*MSB), memory LOAD/STORE, atomics, barriers, TG4/LODQ
 * and half-float pack/unpack.
 */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	/* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]			= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[31]			= { ALU_OP0_NOP, tgsi_unsupported},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[33]			= { ALU_OP0_NOP, tgsi_unsupported},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[81]			= { ALU_OP0_NOP, tgsi_unsupported},
	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[113]			= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[TGSI_OPCODE_DFMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[163]			= { ALU_OP0_NOP, tgsi_unsupported},
	[164]			= { ALU_OP0_NOP, tgsi_unsupported},
	[165]			= { ALU_OP0_NOP, tgsi_unsupported},
	/* No barrier/atomic support on this generation; Evergreen handles these. */
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
9867
/* TGSI -> hardware instruction dispatch table for Evergreen GPUs.  Same
 * layout as r600_shader_tgsi_instruction above, but this generation adds
 * support for FMA, fine derivatives, half-float pack/unpack (PK2H/UP2H),
 * bitfield ops (IBFE/UBFE/BFI/BREV/POPC/LSB/*MSB), memory LOAD/STORE,
 * RAT-based atomics, barriers, TG4/LODQ, per-sample interpolation and the
 * FP64 opcode block at the end.  Bare integer indices are retired TGSI
 * opcode numbers mapped to tgsi_unsupported.
 */
static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL_IEEE, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN_DX10, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX_DX10, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD_IEEE, tgsi_op3},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP3_FMA, tgsi_op3},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[21]			= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[25]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[31]			= { ALU_OP0_NOP, tgsi_unsupported},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[33]			= { ALU_OP0_NOP, tgsi_unsupported},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[35]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_pk2h},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_up2h},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[67]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4_IEEE, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[82]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[93]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[103]			= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[TGSI_OPCODE_RESQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_MEMBAR]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	[113]			= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[115]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	/* Refer below for TGSI_OPCODE_DFMA */
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_load},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_store},
	[163]			= { ALU_OP0_NOP, tgsi_unsupported},
	[164]			= { ALU_OP0_NOP, tgsi_unsupported},
	[165]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
	/* Atomics map to RAT (Random Access Target) "return" instructions. */
	[TGSI_OPCODE_ATOMUADD]	= { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXCHG]	= { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMCAS]	= { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMAND]	= { V_RAT_INST_AND_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMOR]	= { V_RAT_INST_OR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMXOR]	= { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMIN]	= { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMUMAX]	= { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMIN]	= { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_ATOMIMAX]	= { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_bfe},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_bfe},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	/* FP64 (double precision) opcode block. */
	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DDIV]	= { 0, cayman_ddiv_instr },
	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFMA]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
10089
10090 static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
10091 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
10092 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
10093 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
10094 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
10095 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
10096 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
10097 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
10098 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
10099 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
10100 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10101 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10102 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
10103 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
10104 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
10105 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
10106 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
10107 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
10108 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
10109 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
10110 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
10111 [21] = { ALU_OP0_NOP, tgsi_unsupported},
10112 [22] = { ALU_OP0_NOP, tgsi_unsupported},
10113 [23] = { ALU_OP0_NOP, tgsi_unsupported},
10114 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
10115 [25] = { ALU_OP0_NOP, tgsi_unsupported},
10116 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
10117 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
10118 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
10119 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
10120 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow},
10121 [31] = { ALU_OP0_NOP, tgsi_unsupported},
10122 [32] = { ALU_OP0_NOP, tgsi_unsupported},
10123 [33] = { ALU_OP0_NOP, tgsi_unsupported},
10124 [34] = { ALU_OP0_NOP, tgsi_unsupported},
10125 [35] = { ALU_OP0_NOP, tgsi_unsupported},
10126 [TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},
10127 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
10128 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
10129 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
10130 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
10131 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
10132 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
10133 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
10134 [44] = { ALU_OP0_NOP, tgsi_unsupported},
10135 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
10136 [46] = { ALU_OP0_NOP, tgsi_unsupported},
10137 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
10138 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig},
10139 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
10140 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
10141 [51] = { ALU_OP0_NOP, tgsi_unsupported},
10142 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
10143 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
10144 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
10145 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
10146 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
10147 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
10148 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
10149 [59] = { ALU_OP0_NOP, tgsi_unsupported},
10150 [60] = { ALU_OP0_NOP, tgsi_unsupported},
10151 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
10152 [62] = { ALU_OP0_NOP, tgsi_unsupported},
10153 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
10154 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
10155 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
10156 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
10157 [67] = { ALU_OP0_NOP, tgsi_unsupported},
10158 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
10159 [69] = { ALU_OP0_NOP, tgsi_unsupported},
10160 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
10161 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10162 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
10163 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
10164 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
10165 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
10166 [76] = { ALU_OP0_NOP, tgsi_unsupported},
10167 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
10168 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
10169 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
10170 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
10171 [82] = { ALU_OP0_NOP, tgsi_unsupported},
10172 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
10173 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2},
10174 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
10175 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
10176 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
10177 [88] = { ALU_OP0_NOP, tgsi_unsupported},
10178 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
10179 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
10180 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
10181 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
10182 [93] = { ALU_OP0_NOP, tgsi_unsupported},
10183 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
10184 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
10185 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
10186 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
10187 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
10188 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
10189 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
10190 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
10191 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
10192 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
10193 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
10194 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
10195 [106] = { ALU_OP0_NOP, tgsi_unsupported},
10196 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
10197 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
10198 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
10199 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
10200 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
10201 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
10202 [113] = { ALU_OP0_NOP, tgsi_unsupported},
10203 [114] = { ALU_OP0_NOP, tgsi_unsupported},
10204 [115] = { ALU_OP0_NOP, tgsi_unsupported},
10205 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
10206 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
10207 /* Refer below for TGSI_OPCODE_DFMA */
10208 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2},
10209 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
10210 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
10211 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
10212 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
10213 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
10214 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
10215 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
10216 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2},
10217 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2},
10218 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
10219 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
10220 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
10221 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
10222 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
10223 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
10224 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
10225 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
10226 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
10227 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
10228 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
10229 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
10230 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
10231 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
10232 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
10233 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
10234 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
10235 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
10236 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
10237 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
10238 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
10239 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
10240 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
10241 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
10242 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
10243 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
10244 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
10245 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
10246 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
10247 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
10248 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
10249 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
10250 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
10251 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
10252 [163] = { ALU_OP0_NOP, tgsi_unsupported},
10253 [164] = { ALU_OP0_NOP, tgsi_unsupported},
10254 [165] = { ALU_OP0_NOP, tgsi_unsupported},
10255 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
10256 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
10257 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
10258 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
10259 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
10260 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
10261 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
10262 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
10263 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
10264 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
10265 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
10266 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
10267 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
10268 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
10269 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
10270 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
10271 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
10272 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
10273 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
10274 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
10275 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
10276 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
10277 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
10278 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
10279 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
10280 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
10281 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
10282 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
10283 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
10284 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
10285 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
10286 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
10287 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
10288 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
10289 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
10290 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
10291 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
10292 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
10293 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
10294 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
10295 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
10296 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
10297 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
10298 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
10299 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
10300 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
10301 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
10302 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
10303 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
10304 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
10305 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
10306 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
10307 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
10308 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
10309 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
10310 };