r600/shader: refactor mul hi/lo instruction emission
[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include "r600_sq.h"
24 #include "r600_formats.h"
25 #include "r600_opcodes.h"
26 #include "r600_shader.h"
27 #include "r600d.h"
28
29 #include "sb/sb_public.h"
30
31 #include "pipe/p_shader_tokens.h"
32 #include "tgsi/tgsi_info.h"
33 #include "tgsi/tgsi_parse.h"
34 #include "tgsi/tgsi_scan.h"
35 #include "tgsi/tgsi_dump.h"
36 #include "util/u_bitcast.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include <stdio.h>
40 #include <errno.h>
41
42 /* CAYMAN notes
43 Why CAYMAN got loops for lots of instructions is explained here.
44
45 -These 8xx t-slot only ops are implemented in all vector slots.
46 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47 These 8xx t-slot only opcodes become vector ops, with all four
48 slots expecting the arguments on sources a and b. Result is
49 broadcast to all channels.
50 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51 These 8xx t-slot only opcodes become vector ops in the z, y, and
52 x slots.
53 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55 SQRT_IEEE/_64
56 SIN/COS
57 The w slot may have an independent co-issued operation, or if the
58 result is required to be in the w slot, the opcode above may be
59 issued in the w slot as well.
60 The compiler must issue the source argument to slots z, y, and x
61 */
62
63 /* Contents of r0 on entry to various shaders
64
65 VS - .x = VertexID
66 .y = RelVertexID (??)
67 .w = InstanceID
68
69 GS - r0.xyw, r1.xyz = per-vertex offsets
70 r0.z = PrimitiveID
71
72 TCS - .x = PatchID
73 .y = RelPatchID (??)
74 .z = InvocationID
75 .w = tess factor base.
76
77 TES - .x = TessCoord.x
78 - .y = TessCoord.y
79 - .z = RelPatchID (??)
80 - .w = PrimitiveID
81
82 PS - face_gpr.z = SampleMask
83 face_gpr.w = SampleID
84 */
85 #define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
86 static int r600_shader_from_tgsi(struct r600_context *rctx,
87 struct r600_pipe_shader *pipeshader,
88 union r600_shader_key key);
89
90 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
91 int size, unsigned comp_mask) {
92
93 if (!size)
94 return;
95
96 if (ps->num_arrays == ps->max_arrays) {
97 ps->max_arrays += 64;
98 ps->arrays = realloc(ps->arrays, ps->max_arrays *
99 sizeof(struct r600_shader_array));
100 }
101
102 int n = ps->num_arrays;
103 ++ps->num_arrays;
104
105 ps->arrays[n].comp_mask = comp_mask;
106 ps->arrays[n].gpr_start = start_gpr;
107 ps->arrays[n].gpr_count = size;
108 }
109
110 static void r600_dump_streamout(struct pipe_stream_output_info *so)
111 {
112 unsigned i;
113
114 fprintf(stderr, "STREAMOUT\n");
115 for (i = 0; i < so->num_outputs; i++) {
116 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
117 so->output[i].start_component;
118 fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
119 i,
120 so->output[i].stream,
121 so->output[i].output_buffer,
122 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
123 so->output[i].register_index,
124 mask & 1 ? "x" : "",
125 mask & 2 ? "y" : "",
126 mask & 4 ? "z" : "",
127 mask & 8 ? "w" : "",
128 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
129 }
130 }
131
132 static int store_shader(struct pipe_context *ctx,
133 struct r600_pipe_shader *shader)
134 {
135 struct r600_context *rctx = (struct r600_context *)ctx;
136 uint32_t *ptr, i;
137
138 if (shader->bo == NULL) {
139 shader->bo = (struct r600_resource*)
140 pipe_buffer_create(ctx->screen, 0, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
141 if (shader->bo == NULL) {
142 return -ENOMEM;
143 }
144 ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
145 if (R600_BIG_ENDIAN) {
146 for (i = 0; i < shader->shader.bc.ndw; ++i) {
147 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
148 }
149 } else {
150 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
151 }
152 rctx->b.ws->buffer_unmap(shader->bo->buf);
153 }
154
155 return 0;
156 }
157
/*
 * Build a complete hardware shader from the selector's TGSI tokens.
 *
 * Steps, in order: optional TGSI/stream-out dump, TGSI -> r600 translation,
 * bytecode build (skipped when bytecode already exists), optional SB pass
 * and/or disassembly, upload of the GS copy shader and of the shader itself,
 * and finally the per-stage state update.
 *
 * Returns 0 on success.  On any failure the shader is torn down with
 * r600_pipe_shader_destroy() and the failing step's error code is returned
 * (-EINVAL when the processor type is not handled by the final switch).
 */
158 int r600_pipe_shader_create(struct pipe_context *ctx,
159 struct r600_pipe_shader *shader,
160 union r600_shader_key key)
161 {
162 struct r600_context *rctx = (struct r600_context *)ctx;
163 struct r600_pipe_shader_selector *sel = shader->selector;
164 int r;
165 bool dump = r600_can_dump_shader(&rctx->screen->b,
166 tgsi_get_processor_type(sel->tokens));
167 unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
/* NOTE: sb_disasm is latched from the initial use_sb value; the per-shader
 * restrictions applied to use_sb further down do not affect it. */
168 unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
169 unsigned export_shader;
170
171 shader->shader.bc.isa = rctx->isa;
172
173 if (dump) {
174 fprintf(stderr, "--------------------------------------------------------------\n");
175 tgsi_dump(sel->tokens, 0);
176
177 if (sel->so.num_outputs) {
178 r600_dump_streamout(&sel->so);
179 }
180 }
181 r = r600_shader_from_tgsi(rctx, shader, key);
182 if (r) {
183 R600_ERR("translation from TGSI failed !\n");
184 goto error;
185 }
/* From here on use_sb is only ever cleared: SB is not used for LS vertex
 * shaders, tessellation and compute shaders, or shaders that use doubles,
 * atomics or images. */
186 if (shader->shader.processor_type == PIPE_SHADER_VERTEX) {
187 /* only disable for vertex shaders in tess paths */
188 if (key.vs.as_ls)
189 use_sb = 0;
190 }
191 use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_CTRL);
192 use_sb &= (shader->shader.processor_type != PIPE_SHADER_TESS_EVAL);
193 use_sb &= (shader->shader.processor_type != PIPE_SHADER_COMPUTE);
194
195 /* disable SB for shaders using doubles */
196 use_sb &= !shader->shader.uses_doubles;
197
198 use_sb &= !shader->shader.uses_atomics;
199 use_sb &= !shader->shader.uses_images;
200
201 /* Check if the bytecode has already been built. */
202 if (!shader->shader.bc.bytecode) {
203 r = r600_bytecode_build(&shader->shader.bc);
204 if (r) {
205 R600_ERR("building bytecode failed !\n");
206 goto error;
207 }
208 }
209
/* Either dump with the plain disassembler, or hand the bytecode to the SB
 * backend (which receives both the dump and the use_sb flags). */
210 if (dump && !sb_disasm) {
211 fprintf(stderr, "--------------------------------------------------------------\n");
212 r600_bytecode_disasm(&shader->shader.bc);
213 fprintf(stderr, "______________________________________________________________\n");
214 } else if ((dump && sb_disasm) || use_sb) {
215 r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
216 dump, use_sb);
217 if (r) {
218 R600_ERR("r600_sb_bytecode_process failed !\n");
219 goto error;
220 }
221 }
222
223 if (shader->gs_copy_shader) {
224 if (dump) {
225 // dump copy shader
226 r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
227 &shader->gs_copy_shader->shader, dump, 0);
228 if (r)
229 goto error;
230 }
231
232 if ((r = store_shader(ctx, shader->gs_copy_shader)))
233 goto error;
234 }
235
236 /* Store the shader in a buffer. */
237 if ((r = store_shader(ctx, shader)))
238 goto error;
239
240 /* Build state. */
241 switch (shader->shader.processor_type) {
242 case PIPE_SHADER_TESS_CTRL:
243 evergreen_update_hs_state(ctx, shader);
244 break;
245 case PIPE_SHADER_TESS_EVAL:
246 if (key.tes.as_es)
247 evergreen_update_es_state(ctx, shader);
248 else
249 evergreen_update_vs_state(ctx, shader);
250 break;
251 case PIPE_SHADER_GEOMETRY:
/* The GS case dereferences shader->gs_copy_shader unconditionally;
 * presumably translation always creates it for geometry shaders -- confirm. */
252 if (rctx->b.chip_class >= EVERGREEN) {
253 evergreen_update_gs_state(ctx, shader);
254 evergreen_update_vs_state(ctx, shader->gs_copy_shader);
255 } else {
256 r600_update_gs_state(ctx, shader);
257 r600_update_vs_state(ctx, shader->gs_copy_shader);
258 }
259 break;
260 case PIPE_SHADER_VERTEX:
261 export_shader = key.vs.as_es;
262 if (rctx->b.chip_class >= EVERGREEN) {
263 if (key.vs.as_ls)
264 evergreen_update_ls_state(ctx, shader);
265 else if (key.vs.as_es)
266 evergreen_update_es_state(ctx, shader);
267 else
268 evergreen_update_vs_state(ctx, shader);
269 } else {
270 if (export_shader)
271 r600_update_es_state(ctx, shader);
272 else
273 r600_update_vs_state(ctx, shader);
274 }
275 break;
276 case PIPE_SHADER_FRAGMENT:
277 if (rctx->b.chip_class >= EVERGREEN) {
278 evergreen_update_ps_state(ctx, shader);
279 } else {
280 r600_update_ps_state(ctx, shader);
281 }
282 break;
283 case PIPE_SHADER_COMPUTE:
/* Compute shaders go through the LS state update. */
284 evergreen_update_ls_state(ctx, shader);
285 break;
286 default:
287 r = -EINVAL;
288 goto error;
289 }
290 return 0;
291
292 error:
293 r600_pipe_shader_destroy(ctx, shader);
294 return r;
295 }
296
297 void r600_pipe_shader_destroy(struct pipe_context *ctx UNUSED, struct r600_pipe_shader *shader)
298 {
299 r600_resource_reference(&shader->bo, NULL);
300 r600_bytecode_clear(&shader->shader.bc);
301 r600_release_command_buffer(&shader->command_buffer);
302 }
303
304 /*
305 * tgsi -> r600 shader
306 */
307 struct r600_shader_tgsi_instruction;
308
/*
 * Shader-level description of one instruction source operand.
 * r600_bytecode_src() takes one of these plus a channel; presumably it fills
 * the matching struct r600_bytecode_alu_src from it -- confirm against its
 * definition.
 */
309 struct r600_shader_src {
/* presumably the register/constant selector -- TODO confirm */
310 unsigned sel;
/* one swizzle entry per channel */
311 unsigned swizzle[4];
/* presumably negate / absolute-value / relative-addressing flags -- TODO confirm */
312 unsigned neg;
313 unsigned abs;
314 unsigned rel;
/* presumably the constant cache bank -- TODO confirm */
315 unsigned kc_bank;
316 boolean kc_rel; /* true if cache bank is indexed */
/* one 32-bit value per channel; presumably literal data -- TODO confirm */
317 uint32_t value[4];
318 };
319
/*
 * One evergreen interpolator slot (see r600_shader_ctx::eg_interpolators).
 * ij_index is copied into the shader input by
 * evergreen_interp_assign_ij_index(); evergreen_interp_alu() then derives the
 * source GPR (ij_index / 2) and base channel from it.
 */
320 struct eg_interp {
/* presumably set when some input uses this interpolator -- TODO confirm */
321 boolean enabled;
322 unsigned ij_index;
323 };
324
/*
 * Per-translation state used while converting one TGSI shader to r600
 * bytecode.  It is passed to every tgsi_* handler in this file.
 */
325 struct r600_shader_ctx {
326 struct tgsi_shader_info info;
/* parse.FullToken holds the declaration/instruction currently processed */
327 struct tgsi_parse_context parse;
328 const struct tgsi_token *tokens;
/* PIPE_SHADER_* stage being compiled */
329 unsigned type;
/* first GPR of each TGSI register file */
330 unsigned file_offset[TGSI_FILE_COUNT];
/* base GPR of the driver temporaries handed out by r600_get_temp() */
331 unsigned temp_reg;
/* table entry (opcode + handler) for the current instruction */
332 const struct r600_shader_tgsi_instruction *inst_info;
333 struct r600_bytecode *bc;
334 struct r600_shader *shader;
335 struct r600_shader_src src[4];
336 uint32_t *literals;
337 uint32_t nliterals;
/* number of driver temporaries handed out so far by r600_get_temp() */
338 uint32_t max_driver_temp_used;
339 /* needed for evergreen interpolation */
340 struct eg_interp eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
341 /* evergreen/cayman also store sample mask in face register */
342 int face_gpr;
343 /* sample id is .w component stored in fixed point position register */
344 int fixed_pt_position_gpr;
/* incremented once per TGSI_SEMANTIC_COLOR fragment input declaration */
345 int colors_used;
/* set when a TGSI_SEMANTIC_CLIPVERTEX output is declared; cv_output is its index */
346 boolean clip_vertex_write;
347 unsigned cv_output;
/* index of the TGSI_SEMANTIC_EDGEFLAG output */
348 unsigned edgeflag_output;
349 int cs_block_size_reg;
350 int cs_grid_size_reg;
351 bool cs_block_size_loaded, cs_grid_size_loaded;
/* index of the TGSI_SEMANTIC_POSITION fragment input */
352 int fragcoord_input;
/* running ring offset for GS inputs, advanced by 16 per input */
353 int next_ring_offset;
/* advanced by 16 per declared GS output */
354 int gs_out_ring_offset;
355 int gs_next_vertex;
356 struct r600_shader *gs_for_vs;
357 int gs_export_gpr_tregs[4];
358 int gs_rotated_input[2];
359 const struct pipe_stream_output_info *gs_stream_output_info;
360 unsigned enabled_stream_buffers_mask;
361 unsigned tess_input_info; /* temp with tess input offsets */
362 unsigned tess_output_info; /* temp with tess output offsets -- NOTE(review): this comment previously repeated the "input" wording above, apparently a copy/paste slip; confirm */
363 unsigned thread_id_gpr; /* temp with thread id calculated for images */
364 bool thread_id_gpr_loaded;
365 };
366
/*
 * One entry of the per-chip TGSI opcode tables (r600_/eg_/cm_shader_tgsi_instruction[]).
 */
367 struct r600_shader_tgsi_instruction {
/* hardware opcode; handlers such as tgsi_barrier() copy it into alu.op */
368 unsigned op;
/* handler that emits the instruction; returns an int status like the other tgsi_* handlers in this file */
369 int (*process)(struct r600_shader_ctx *ctx);
370 };
371
372 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
373 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
374 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
375 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
376 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
377 static int tgsi_else(struct r600_shader_ctx *ctx);
378 static int tgsi_endif(struct r600_shader_ctx *ctx);
379 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
380 static int tgsi_endloop(struct r600_shader_ctx *ctx);
381 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
382 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
383 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
384 unsigned int dst_reg);
385 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
386 const struct r600_shader_src *shader_src,
387 unsigned chan);
388 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
389 unsigned dst_reg, unsigned mask);
390
/**
 * Return the index (0..3) of the highest channel enabled in the low four
 * bits of \p writemask.  Returns 0 when none of those bits is set.
 */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* Scan from w down to y; x (or an empty mask) falls out as 0. */
	for (chan = 3; chan > 0; chan--) {
		if (writemask & (1 << chan))
			return chan;
	}
	return 0;
}
402
403 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
404 {
405 struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
406 unsigned j;
407
408 if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
409 R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
410 return -EINVAL;
411 }
412 #if 0
413 if (i->Instruction.Label) {
414 R600_ERR("label unsupported\n");
415 return -EINVAL;
416 }
417 #endif
418 for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
419 if (i->Src[j].Register.Dimension) {
420 switch (i->Src[j].Register.File) {
421 case TGSI_FILE_CONSTANT:
422 case TGSI_FILE_HW_ATOMIC:
423 break;
424 case TGSI_FILE_INPUT:
425 if (ctx->type == PIPE_SHADER_GEOMETRY ||
426 ctx->type == PIPE_SHADER_TESS_CTRL ||
427 ctx->type == PIPE_SHADER_TESS_EVAL)
428 break;
429 case TGSI_FILE_OUTPUT:
430 if (ctx->type == PIPE_SHADER_TESS_CTRL)
431 break;
432 default:
433 R600_ERR("unsupported src %d (file %d, dimension %d)\n", j,
434 i->Src[j].Register.File,
435 i->Src[j].Register.Dimension);
436 return -EINVAL;
437 }
438 }
439 }
440 for (j = 0; j < i->Instruction.NumDstRegs; j++) {
441 if (i->Dst[j].Register.Dimension) {
442 if (ctx->type == PIPE_SHADER_TESS_CTRL)
443 continue;
444 R600_ERR("unsupported dst (dimension)\n");
445 return -EINVAL;
446 }
447 }
448 return 0;
449 }
450
451 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
452 {
453 if (interpolate == TGSI_INTERPOLATE_COLOR ||
454 interpolate == TGSI_INTERPOLATE_LINEAR ||
455 interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
456 {
457 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
458 int loc;
459
460 switch(location) {
461 case TGSI_INTERPOLATE_LOC_CENTER:
462 loc = 1;
463 break;
464 case TGSI_INTERPOLATE_LOC_CENTROID:
465 loc = 2;
466 break;
467 case TGSI_INTERPOLATE_LOC_SAMPLE:
468 default:
469 loc = 0; break;
470 }
471
472 return is_linear * 3 + loc;
473 }
474
475 return -1;
476 }
477
478 static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
479 int input)
480 {
481 int i = eg_get_interpolator_index(
482 ctx->shader->input[input].interpolate,
483 ctx->shader->input[input].interpolate_location);
484 assert(i >= 0);
485 ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
486 }
487
488 static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
489 {
490 int i, r;
491 struct r600_bytecode_alu alu;
492 int gpr = 0, base_chan = 0;
493 int ij_index = ctx->shader->input[input].ij_index;
494
495 /* work out gpr and base_chan from index */
496 gpr = ij_index / 2;
497 base_chan = (2 * (ij_index % 2)) + 1;
498
499 for (i = 0; i < 8; i++) {
500 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
501
502 if (i < 4)
503 alu.op = ALU_OP2_INTERP_ZW;
504 else
505 alu.op = ALU_OP2_INTERP_XY;
506
507 if ((i > 1) && (i < 6)) {
508 alu.dst.sel = ctx->shader->input[input].gpr;
509 alu.dst.write = 1;
510 }
511
512 alu.dst.chan = i % 4;
513
514 alu.src[0].sel = gpr;
515 alu.src[0].chan = (base_chan - (i % 2));
516
517 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
518
519 alu.bank_swizzle_force = SQ_ALU_VEC_210;
520 if ((i % 4) == 3)
521 alu.last = 1;
522 r = r600_bytecode_add_alu(ctx->bc, &alu);
523 if (r)
524 return r;
525 }
526 return 0;
527 }
528
529 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
530 {
531 int i, r;
532 struct r600_bytecode_alu alu;
533
534 for (i = 0; i < 4; i++) {
535 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
536
537 alu.op = ALU_OP1_INTERP_LOAD_P0;
538
539 alu.dst.sel = ctx->shader->input[input].gpr;
540 alu.dst.write = 1;
541
542 alu.dst.chan = i;
543
544 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
545 alu.src[0].chan = i;
546
547 if (i == 3)
548 alu.last = 1;
549 r = r600_bytecode_add_alu(ctx->bc, &alu);
550 if (r)
551 return r;
552 }
553 return 0;
554 }
555
556 /*
557 * Special export handling in shaders
558 *
559 * shader export ARRAY_BASE for EXPORT_POS:
560 * 60 is position
561 * 61 is misc vector
562 * 62, 63 are clip distance vectors
563 *
564 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
565 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
566 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
567 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
568 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
569 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
570 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
571 * exclusive from render target index)
572 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
573 *
574 *
575 * shader export ARRAY_BASE for EXPORT_PIXEL:
576 * 0-7 CB targets
577 * 61 computed Z vector
578 *
579 * The use of the values exported in the computed Z vector are controlled
580 * by DB_SHADER_CONTROL:
581 * Z_EXPORT_ENABLE - Z as a float in RED
582 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
583 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
584 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
585 * DB_SOURCE_FORMAT - export control restrictions
586 *
587 */
588
589
590 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
591 static int r600_spi_sid(struct r600_shader_io * io)
592 {
593 int index, name = io->name;
594
595 /* These params are handled differently, they don't need
596 * semantic indices, so we'll use 0 for them.
597 */
598 if (name == TGSI_SEMANTIC_POSITION ||
599 name == TGSI_SEMANTIC_PSIZE ||
600 name == TGSI_SEMANTIC_EDGEFLAG ||
601 name == TGSI_SEMANTIC_FACE ||
602 name == TGSI_SEMANTIC_SAMPLEMASK)
603 index = 0;
604 else {
605 if (name == TGSI_SEMANTIC_GENERIC) {
606 /* For generic params simply use sid from tgsi */
607 index = io->sid;
608 } else {
609 /* For non-generic params - pack name and sid into 8 bits */
610 index = 0x80 | (name<<3) | (io->sid);
611 }
612
613 /* Make sure that all really used indices have nonzero value, so
614 * we can just compare it to 0 later instead of comparing the name
615 * with different values to detect special cases. */
616 index++;
617 }
618
619 return index;
620 };
621
622 /* we need this to get a common lds index for vs/tcs/tes input/outputs */
623 int r600_get_lds_unique_index(unsigned semantic_name, unsigned index)
624 {
625 switch (semantic_name) {
626 case TGSI_SEMANTIC_POSITION:
627 return 0;
628 case TGSI_SEMANTIC_PSIZE:
629 return 1;
630 case TGSI_SEMANTIC_CLIPDIST:
631 assert(index <= 1);
632 return 2 + index;
633 case TGSI_SEMANTIC_GENERIC:
634 if (index <= 63-4)
635 return 4 + index - 9;
636 else
637 /* same explanation as in the default statement,
638 * the only user hitting this is st/nine.
639 */
640 return 0;
641
642 /* patch indices are completely separate and thus start from 0 */
643 case TGSI_SEMANTIC_TESSOUTER:
644 return 0;
645 case TGSI_SEMANTIC_TESSINNER:
646 return 1;
647 case TGSI_SEMANTIC_PATCH:
648 return 2 + index;
649
650 default:
651 /* Don't fail here. The result of this function is only used
652 * for LS, TCS, TES, and GS, where legacy GL semantics can't
653 * occur, but this function is called for all vertex shaders
654 * before it's known whether LS will be compiled or not.
655 */
656 return 0;
657 }
658 }
659
660 /* turn input into interpolate on EG */
661 static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
662 {
663 int r = 0;
664
665 if (ctx->shader->input[index].spi_sid) {
666 ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
667 if (ctx->shader->input[index].interpolate > 0) {
668 evergreen_interp_assign_ij_index(ctx, index);
669 r = evergreen_interp_alu(ctx, index);
670 } else {
671 r = evergreen_interp_flat(ctx, index);
672 }
673 }
674 return r;
675 }
676
677 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
678 {
679 struct r600_bytecode_alu alu;
680 int i, r;
681 int gpr_front = ctx->shader->input[front].gpr;
682 int gpr_back = ctx->shader->input[back].gpr;
683
684 for (i = 0; i < 4; i++) {
685 memset(&alu, 0, sizeof(alu));
686 alu.op = ALU_OP3_CNDGT;
687 alu.is_op3 = 1;
688 alu.dst.write = 1;
689 alu.dst.sel = gpr_front;
690 alu.src[0].sel = ctx->face_gpr;
691 alu.src[1].sel = gpr_front;
692 alu.src[2].sel = gpr_back;
693
694 alu.dst.chan = i;
695 alu.src[1].chan = i;
696 alu.src[2].chan = i;
697 alu.last = (i==3);
698
699 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
700 return r;
701 }
702
703 return 0;
704 }
705
706 /* execute a single slot ALU calculation */
707 static int single_alu_op2(struct r600_shader_ctx *ctx, int op,
708 int dst_sel, int dst_chan,
709 int src0_sel, unsigned src0_chan_val,
710 int src1_sel, unsigned src1_chan_val)
711 {
712 struct r600_bytecode_alu alu;
713 int r, i;
714
715 if (ctx->bc->chip_class == CAYMAN && op == ALU_OP2_MULLO_INT) {
716 for (i = 0; i < 4; i++) {
717 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
718 alu.op = op;
719 alu.src[0].sel = src0_sel;
720 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
721 alu.src[0].value = src0_chan_val;
722 else
723 alu.src[0].chan = src0_chan_val;
724 alu.src[1].sel = src1_sel;
725 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
726 alu.src[1].value = src1_chan_val;
727 else
728 alu.src[1].chan = src1_chan_val;
729 alu.dst.sel = dst_sel;
730 alu.dst.chan = i;
731 alu.dst.write = i == dst_chan;
732 alu.last = (i == 3);
733 r = r600_bytecode_add_alu(ctx->bc, &alu);
734 if (r)
735 return r;
736 }
737 return 0;
738 }
739
740 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
741 alu.op = op;
742 alu.src[0].sel = src0_sel;
743 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
744 alu.src[0].value = src0_chan_val;
745 else
746 alu.src[0].chan = src0_chan_val;
747 alu.src[1].sel = src1_sel;
748 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
749 alu.src[1].value = src1_chan_val;
750 else
751 alu.src[1].chan = src1_chan_val;
752 alu.dst.sel = dst_sel;
753 alu.dst.chan = dst_chan;
754 alu.dst.write = 1;
755 alu.last = 1;
756 r = r600_bytecode_add_alu(ctx->bc, &alu);
757 if (r)
758 return r;
759 return 0;
760 }
761
762 /* execute a single slot ALU calculation */
763 static int single_alu_op3(struct r600_shader_ctx *ctx, int op,
764 int dst_sel, int dst_chan,
765 int src0_sel, unsigned src0_chan_val,
766 int src1_sel, unsigned src1_chan_val,
767 int src2_sel, unsigned src2_chan_val)
768 {
769 struct r600_bytecode_alu alu;
770 int r;
771
772 /* validate this for other ops */
773 assert(op == ALU_OP3_MULADD_UINT24 || op == ALU_OP3_CNDE_INT);
774 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
775 alu.op = op;
776 alu.src[0].sel = src0_sel;
777 if (src0_sel == V_SQ_ALU_SRC_LITERAL)
778 alu.src[0].value = src0_chan_val;
779 else
780 alu.src[0].chan = src0_chan_val;
781 alu.src[1].sel = src1_sel;
782 if (src1_sel == V_SQ_ALU_SRC_LITERAL)
783 alu.src[1].value = src1_chan_val;
784 else
785 alu.src[1].chan = src1_chan_val;
786 alu.src[2].sel = src2_sel;
787 if (src2_sel == V_SQ_ALU_SRC_LITERAL)
788 alu.src[2].value = src2_chan_val;
789 else
790 alu.src[2].chan = src2_chan_val;
791 alu.dst.sel = dst_sel;
792 alu.dst.chan = dst_chan;
793 alu.is_op3 = 1;
794 alu.last = 1;
795 r = r600_bytecode_add_alu(ctx->bc, &alu);
796 if (r)
797 return r;
798 return 0;
799 }
800
801 /* put it in temp_reg.x */
802 static int get_lds_offset0(struct r600_shader_ctx *ctx,
803 int rel_patch_chan,
804 int temp_reg, bool is_patch_var)
805 {
806 int r;
807
808 /* MUL temp.x, patch_stride (input_vals.x), rel_patch_id (r0.y (tcs)) */
809 /* ADD
810 Dimension - patch0_offset (input_vals.z),
811 Non-dim - patch0_data_offset (input_vals.w)
812 */
813 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
814 temp_reg, 0,
815 ctx->tess_output_info, 0,
816 0, rel_patch_chan,
817 ctx->tess_output_info, is_patch_var ? 3 : 2);
818 if (r)
819 return r;
820 return 0;
821 }
822
823 static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
824 {
825 return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
826 }
827
828 static int r600_get_temp(struct r600_shader_ctx *ctx)
829 {
830 return ctx->temp_reg + ctx->max_driver_temp_used++;
831 }
832
833 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
834 {
835 int i;
836 i = ctx->shader->noutput++;
837 ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
838 ctx->shader->output[i].sid = 0;
839 ctx->shader->output[i].gpr = 0;
840 ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
841 ctx->shader->output[i].write_mask = 0x4;
842 ctx->shader->output[i].spi_sid = prim_id_sid;
843
844 return 0;
845 }
846
847 static int tgsi_barrier(struct r600_shader_ctx *ctx)
848 {
849 struct r600_bytecode_alu alu;
850 int r;
851
852 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
853 alu.op = ctx->inst_info->op;
854 alu.last = 1;
855
856 r = r600_bytecode_add_alu(ctx->bc, &alu);
857 if (r)
858 return r;
859 return 0;
860 }
861
862 static int tgsi_declaration(struct r600_shader_ctx *ctx)
863 {
864 struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
865 int r, i, j, count = d->Range.Last - d->Range.First + 1;
866
867 switch (d->Declaration.File) {
868 case TGSI_FILE_INPUT:
869 for (j = 0; j < count; j++) {
870 i = ctx->shader->ninput + j;
871 assert(i < ARRAY_SIZE(ctx->shader->input));
872 ctx->shader->input[i].name = d->Semantic.Name;
873 ctx->shader->input[i].sid = d->Semantic.Index + j;
874 ctx->shader->input[i].interpolate = d->Interp.Interpolate;
875 ctx->shader->input[i].interpolate_location = d->Interp.Location;
876 ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
877 if (ctx->type == PIPE_SHADER_FRAGMENT) {
878 ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
879 switch (ctx->shader->input[i].name) {
880 case TGSI_SEMANTIC_FACE:
881 if (ctx->face_gpr != -1)
882 ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
883 else
884 ctx->face_gpr = ctx->shader->input[i].gpr;
885 break;
886 case TGSI_SEMANTIC_COLOR:
887 ctx->colors_used++;
888 break;
889 case TGSI_SEMANTIC_POSITION:
890 ctx->fragcoord_input = i;
891 break;
892 case TGSI_SEMANTIC_PRIMID:
893 /* set this for now */
894 ctx->shader->gs_prim_id_input = true;
895 ctx->shader->ps_prim_id_input = i;
896 break;
897 }
898 if (ctx->bc->chip_class >= EVERGREEN) {
899 if ((r = evergreen_interp_input(ctx, i)))
900 return r;
901 }
902 } else if (ctx->type == PIPE_SHADER_GEOMETRY) {
903 /* FIXME probably skip inputs if they aren't passed in the ring */
904 ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
905 ctx->next_ring_offset += 16;
906 if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
907 ctx->shader->gs_prim_id_input = true;
908 }
909 }
910 ctx->shader->ninput += count;
911 break;
912 case TGSI_FILE_OUTPUT:
913 for (j = 0; j < count; j++) {
914 i = ctx->shader->noutput + j;
915 assert(i < ARRAY_SIZE(ctx->shader->output));
916 ctx->shader->output[i].name = d->Semantic.Name;
917 ctx->shader->output[i].sid = d->Semantic.Index + j;
918 ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
919 ctx->shader->output[i].interpolate = d->Interp.Interpolate;
920 ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
921 if (ctx->type == PIPE_SHADER_VERTEX ||
922 ctx->type == PIPE_SHADER_GEOMETRY ||
923 ctx->type == PIPE_SHADER_TESS_EVAL) {
924 ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
925 switch (d->Semantic.Name) {
926 case TGSI_SEMANTIC_CLIPDIST:
927 break;
928 case TGSI_SEMANTIC_PSIZE:
929 ctx->shader->vs_out_misc_write = 1;
930 ctx->shader->vs_out_point_size = 1;
931 break;
932 case TGSI_SEMANTIC_EDGEFLAG:
933 ctx->shader->vs_out_misc_write = 1;
934 ctx->shader->vs_out_edgeflag = 1;
935 ctx->edgeflag_output = i;
936 break;
937 case TGSI_SEMANTIC_VIEWPORT_INDEX:
938 ctx->shader->vs_out_misc_write = 1;
939 ctx->shader->vs_out_viewport = 1;
940 break;
941 case TGSI_SEMANTIC_LAYER:
942 ctx->shader->vs_out_misc_write = 1;
943 ctx->shader->vs_out_layer = 1;
944 break;
945 case TGSI_SEMANTIC_CLIPVERTEX:
946 ctx->clip_vertex_write = TRUE;
947 ctx->cv_output = i;
948 break;
949 }
950 if (ctx->type == PIPE_SHADER_GEOMETRY) {
951 ctx->gs_out_ring_offset += 16;
952 }
953 } else if (ctx->type == PIPE_SHADER_FRAGMENT) {
954 switch (d->Semantic.Name) {
955 case TGSI_SEMANTIC_COLOR:
956 ctx->shader->nr_ps_max_color_exports++;
957 break;
958 }
959 }
960 }
961 ctx->shader->noutput += count;
962 break;
963 case TGSI_FILE_TEMPORARY:
964 if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
965 if (d->Array.ArrayID) {
966 r600_add_gpr_array(ctx->shader,
967 ctx->file_offset[TGSI_FILE_TEMPORARY] +
968 d->Range.First,
969 d->Range.Last - d->Range.First + 1, 0x0F);
970 }
971 }
972 break;
973
974 case TGSI_FILE_CONSTANT:
975 case TGSI_FILE_SAMPLER:
976 case TGSI_FILE_SAMPLER_VIEW:
977 case TGSI_FILE_ADDRESS:
978 case TGSI_FILE_BUFFER:
979 case TGSI_FILE_IMAGE:
980 case TGSI_FILE_MEMORY:
981 break;
982
983 case TGSI_FILE_HW_ATOMIC:
984 i = ctx->shader->nhwatomic_ranges;
985 ctx->shader->atomics[i].start = d->Range.First;
986 ctx->shader->atomics[i].end = d->Range.Last;
987 ctx->shader->atomics[i].hw_idx = ctx->shader->atomic_base + ctx->shader->nhwatomic;
988 ctx->shader->atomics[i].array_id = d->Array.ArrayID;
989 ctx->shader->atomics[i].buffer_id = d->Dim.Index2D;
990 ctx->shader->nhwatomic_ranges++;
991 ctx->shader->nhwatomic += count;
992 break;
993
994 case TGSI_FILE_SYSTEM_VALUE:
995 if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
996 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
997 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
998 break; /* Already handled from allocate_system_value_inputs */
999 } else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
1000 break;
1001 } else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
1002 break;
1003 else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
1004 break;
1005 else if (d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
1006 d->Semantic.Name == TGSI_SEMANTIC_TESSOUTER) {
1007 int param = r600_get_lds_unique_index(d->Semantic.Name, 0);
1008 int dreg = d->Semantic.Name == TGSI_SEMANTIC_TESSINNER ? 3 : 2;
1009 unsigned temp_reg = r600_get_temp(ctx);
1010
1011 r = get_lds_offset0(ctx, 2, temp_reg, true);
1012 if (r)
1013 return r;
1014
1015 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1016 temp_reg, 0,
1017 temp_reg, 0,
1018 V_SQ_ALU_SRC_LITERAL, param * 16);
1019 if (r)
1020 return r;
1021
1022 do_lds_fetch_values(ctx, temp_reg, dreg, 0xf);
1023 }
1024 else if (d->Semantic.Name == TGSI_SEMANTIC_TESSCOORD) {
1025 /* MOV r1.x, r0.x;
1026 MOV r1.y, r0.y;
1027 */
1028 for (i = 0; i < 2; i++) {
1029 struct r600_bytecode_alu alu;
1030 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1031 alu.op = ALU_OP1_MOV;
1032 alu.src[0].sel = 0;
1033 alu.src[0].chan = 0 + i;
1034 alu.dst.sel = 1;
1035 alu.dst.chan = 0 + i;
1036 alu.dst.write = 1;
1037 alu.last = (i == 1) ? 1 : 0;
1038 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1039 return r;
1040 }
1041 /* ADD r1.z, 1.0f, -r0.x */
1042 struct r600_bytecode_alu alu;
1043 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1044 alu.op = ALU_OP2_ADD;
1045 alu.src[0].sel = V_SQ_ALU_SRC_1;
1046 alu.src[1].sel = 1;
1047 alu.src[1].chan = 0;
1048 alu.src[1].neg = 1;
1049 alu.dst.sel = 1;
1050 alu.dst.chan = 2;
1051 alu.dst.write = 1;
1052 alu.last = 1;
1053 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1054 return r;
1055
1056 /* ADD r1.z, r1.z, -r1.y */
1057 alu.op = ALU_OP2_ADD;
1058 alu.src[0].sel = 1;
1059 alu.src[0].chan = 2;
1060 alu.src[1].sel = 1;
1061 alu.src[1].chan = 1;
1062 alu.src[1].neg = 1;
1063 alu.dst.sel = 1;
1064 alu.dst.chan = 2;
1065 alu.dst.write = 1;
1066 alu.last = 1;
1067 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1068 return r;
1069 break;
1070 }
1071 break;
1072 default:
1073 R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
1074 return -EINVAL;
1075 }
1076 return 0;
1077 }
1078
1079 static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
1080 {
1081 struct tgsi_parse_context parse;
1082 struct {
1083 boolean enabled;
1084 int *reg;
1085 unsigned name, alternate_name;
1086 } inputs[2] = {
1087 { false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */
1088
1089 { false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
1090 };
1091 int num_regs = 0;
1092 unsigned k, i;
1093
1094 if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
1095 return 0;
1096 }
1097
1098 /* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
1099 while (!tgsi_parse_end_of_tokens(&parse)) {
1100 tgsi_parse_token(&parse);
1101
1102 if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
1103 const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
1104 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
1105 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
1106 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
1107 {
1108 int interpolate, location, k;
1109
1110 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
1111 location = TGSI_INTERPOLATE_LOC_CENTER;
1112 inputs[1].enabled = true; /* needs SAMPLEID */
1113 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
1114 location = TGSI_INTERPOLATE_LOC_CENTER;
1115 /* Needs sample positions, currently those are always available */
1116 } else {
1117 location = TGSI_INTERPOLATE_LOC_CENTROID;
1118 }
1119
1120 interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
1121 k = eg_get_interpolator_index(interpolate, location);
1122 if (k >= 0)
1123 ctx->eg_interpolators[k].enabled = true;
1124 }
1125 } else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
1126 struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
1127 if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
1128 for (k = 0; k < ARRAY_SIZE(inputs); k++) {
1129 if (d->Semantic.Name == inputs[k].name ||
1130 d->Semantic.Name == inputs[k].alternate_name) {
1131 inputs[k].enabled = true;
1132 }
1133 }
1134 }
1135 }
1136 }
1137
1138 tgsi_parse_free(&parse);
1139
1140 for (i = 0; i < ARRAY_SIZE(inputs); i++) {
1141 boolean enabled = inputs[i].enabled;
1142 int *reg = inputs[i].reg;
1143 unsigned name = inputs[i].name;
1144
1145 if (enabled) {
1146 int gpr = gpr_offset + num_regs++;
1147 ctx->shader->nsys_inputs++;
1148
1149 // add to inputs, allocate a gpr
1150 k = ctx->shader->ninput++;
1151 ctx->shader->input[k].name = name;
1152 ctx->shader->input[k].sid = 0;
1153 ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
1154 ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
1155 *reg = ctx->shader->input[k].gpr = gpr;
1156 }
1157 }
1158
1159 return gpr_offset + num_regs;
1160 }
1161
1162 /*
1163 * for evergreen we need to scan the shader to find the number of GPRs we need to
1164 * reserve for interpolation and system values
1165 *
1166 * we need to know if we are going to emit
1167 * any sample or centroid inputs
1168 * if perspective and linear are required
1169 */
1170 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
1171 {
1172 unsigned i;
1173 int num_baryc;
1174 struct tgsi_parse_context parse;
1175
1176 memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
1177
1178 for (i = 0; i < ctx->info.num_inputs; i++) {
1179 int k;
1180 /* skip position/face/mask/sampleid */
1181 if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
1182 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
1183 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
1184 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
1185 continue;
1186
1187 k = eg_get_interpolator_index(
1188 ctx->info.input_interpolate[i],
1189 ctx->info.input_interpolate_loc[i]);
1190 if (k >= 0)
1191 ctx->eg_interpolators[k].enabled = TRUE;
1192 }
1193
1194 if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
1195 return 0;
1196 }
1197
1198 /* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
1199 while (!tgsi_parse_end_of_tokens(&parse)) {
1200 tgsi_parse_token(&parse);
1201
1202 if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
1203 const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
1204 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
1205 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
1206 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
1207 {
1208 int interpolate, location, k;
1209
1210 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
1211 location = TGSI_INTERPOLATE_LOC_CENTER;
1212 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
1213 location = TGSI_INTERPOLATE_LOC_CENTER;
1214 } else {
1215 location = TGSI_INTERPOLATE_LOC_CENTROID;
1216 }
1217
1218 interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
1219 k = eg_get_interpolator_index(interpolate, location);
1220 if (k >= 0)
1221 ctx->eg_interpolators[k].enabled = true;
1222 }
1223 }
1224 }
1225
1226 tgsi_parse_free(&parse);
1227
1228 /* assign gpr to each interpolator according to priority */
1229 num_baryc = 0;
1230 for (i = 0; i < ARRAY_SIZE(ctx->eg_interpolators); i++) {
1231 if (ctx->eg_interpolators[i].enabled) {
1232 ctx->eg_interpolators[i].ij_index = num_baryc;
1233 num_baryc ++;
1234 }
1235 }
1236
1237 /* XXX PULL MODEL and LINE STIPPLE */
1238
1239 num_baryc = (num_baryc + 1) >> 1;
1240 return allocate_system_value_inputs(ctx, num_baryc);
1241 }
1242
1243 /* sample_id_sel == NULL means fetch for current sample */
1244 static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
1245 {
1246 struct r600_bytecode_vtx vtx;
1247 int r, t1;
1248
1249 assert(ctx->fixed_pt_position_gpr != -1);
1250
1251 t1 = r600_get_temp(ctx);
1252
1253 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1254 vtx.op = FETCH_OP_VFETCH;
1255 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1256 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1257 if (sample_id == NULL) {
1258 vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
1259 vtx.src_sel_x = 3;
1260 }
1261 else {
1262 struct r600_bytecode_alu alu;
1263
1264 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1265 alu.op = ALU_OP1_MOV;
1266 r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
1267 alu.dst.sel = t1;
1268 alu.dst.write = 1;
1269 alu.last = 1;
1270 r = r600_bytecode_add_alu(ctx->bc, &alu);
1271 if (r)
1272 return r;
1273
1274 vtx.src_gpr = t1;
1275 vtx.src_sel_x = 0;
1276 }
1277 vtx.mega_fetch_count = 16;
1278 vtx.dst_gpr = t1;
1279 vtx.dst_sel_x = 0;
1280 vtx.dst_sel_y = 1;
1281 vtx.dst_sel_z = 2;
1282 vtx.dst_sel_w = 3;
1283 vtx.data_format = FMT_32_32_32_32_FLOAT;
1284 vtx.num_format_all = 2;
1285 vtx.format_comp_all = 1;
1286 vtx.use_const_fields = 0;
1287 vtx.offset = 0;
1288 vtx.endian = r600_endian_swap(32);
1289 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1290
1291 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1292 if (r)
1293 return r;
1294
1295 return t1;
1296 }
1297
1298 static int load_block_grid_size(struct r600_shader_ctx *ctx, bool load_block)
1299 {
1300 struct r600_bytecode_vtx vtx;
1301 int r, t1;
1302
1303 if (ctx->cs_block_size_loaded)
1304 return ctx->cs_block_size_reg;
1305 if (ctx->cs_grid_size_loaded)
1306 return ctx->cs_grid_size_reg;
1307
1308 t1 = load_block ? ctx->cs_block_size_reg : ctx->cs_grid_size_reg;
1309 struct r600_bytecode_alu alu;
1310 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1311 alu.op = ALU_OP1_MOV;
1312 alu.src[0].sel = V_SQ_ALU_SRC_0;
1313 alu.dst.sel = t1;
1314 alu.dst.write = 1;
1315 alu.last = 1;
1316 r = r600_bytecode_add_alu(ctx->bc, &alu);
1317 if (r)
1318 return r;
1319
1320 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
1321 vtx.op = FETCH_OP_VFETCH;
1322 vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
1323 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1324 vtx.src_gpr = t1;
1325 vtx.src_sel_x = 0;
1326
1327 vtx.mega_fetch_count = 16;
1328 vtx.dst_gpr = t1;
1329 vtx.dst_sel_x = 0;
1330 vtx.dst_sel_y = 1;
1331 vtx.dst_sel_z = 2;
1332 vtx.dst_sel_w = 7;
1333 vtx.data_format = FMT_32_32_32_32;
1334 vtx.num_format_all = 1;
1335 vtx.format_comp_all = 0;
1336 vtx.use_const_fields = 0;
1337 vtx.offset = load_block ? 0 : 16; // first element is size of buffer
1338 vtx.endian = r600_endian_swap(32);
1339 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
1340
1341 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
1342 if (r)
1343 return r;
1344
1345 if (load_block)
1346 ctx->cs_block_size_loaded = true;
1347 else
1348 ctx->cs_grid_size_loaded = true;
1349 return t1;
1350 }
1351
1352 static void tgsi_src(struct r600_shader_ctx *ctx,
1353 const struct tgsi_full_src_register *tgsi_src,
1354 struct r600_shader_src *r600_src)
1355 {
1356 memset(r600_src, 0, sizeof(*r600_src));
1357 r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
1358 r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
1359 r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
1360 r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
1361 r600_src->neg = tgsi_src->Register.Negate;
1362 r600_src->abs = tgsi_src->Register.Absolute;
1363
1364 if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
1365 int index;
1366 if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
1367 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
1368 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
1369
1370 index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
1371 r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
1372 if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
1373 return;
1374 }
1375 index = tgsi_src->Register.Index;
1376 r600_src->sel = V_SQ_ALU_SRC_LITERAL;
1377 memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
1378 } else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
1379 if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
1380 r600_src->swizzle[0] = 2; // Z value
1381 r600_src->swizzle[1] = 2;
1382 r600_src->swizzle[2] = 2;
1383 r600_src->swizzle[3] = 2;
1384 r600_src->sel = ctx->face_gpr;
1385 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
1386 r600_src->swizzle[0] = 3; // W value
1387 r600_src->swizzle[1] = 3;
1388 r600_src->swizzle[2] = 3;
1389 r600_src->swizzle[3] = 3;
1390 r600_src->sel = ctx->fixed_pt_position_gpr;
1391 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
1392 r600_src->swizzle[0] = 0;
1393 r600_src->swizzle[1] = 1;
1394 r600_src->swizzle[2] = 4;
1395 r600_src->swizzle[3] = 4;
1396 r600_src->sel = load_sample_position(ctx, NULL, -1);
1397 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
1398 r600_src->swizzle[0] = 3;
1399 r600_src->swizzle[1] = 3;
1400 r600_src->swizzle[2] = 3;
1401 r600_src->swizzle[3] = 3;
1402 r600_src->sel = 0;
1403 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
1404 r600_src->swizzle[0] = 0;
1405 r600_src->swizzle[1] = 0;
1406 r600_src->swizzle[2] = 0;
1407 r600_src->swizzle[3] = 0;
1408 r600_src->sel = 0;
1409 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_THREAD_ID) {
1410 r600_src->sel = 0;
1411 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_ID) {
1412 r600_src->sel = 1;
1413 } else if (ctx->type != PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1414 r600_src->swizzle[0] = 3;
1415 r600_src->swizzle[1] = 3;
1416 r600_src->swizzle[2] = 3;
1417 r600_src->swizzle[3] = 3;
1418 r600_src->sel = 1;
1419 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1420 r600_src->swizzle[0] = 2;
1421 r600_src->swizzle[1] = 2;
1422 r600_src->swizzle[2] = 2;
1423 r600_src->swizzle[3] = 2;
1424 r600_src->sel = 0;
1425 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSCOORD) {
1426 r600_src->sel = 1;
1427 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSINNER) {
1428 r600_src->sel = 3;
1429 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_TESSOUTER) {
1430 r600_src->sel = 2;
1431 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTICESIN) {
1432 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
1433 r600_src->sel = ctx->tess_input_info;
1434 r600_src->swizzle[0] = 2;
1435 r600_src->swizzle[1] = 2;
1436 r600_src->swizzle[2] = 2;
1437 r600_src->swizzle[3] = 2;
1438 } else {
1439 r600_src->sel = ctx->tess_input_info;
1440 r600_src->swizzle[0] = 3;
1441 r600_src->swizzle[1] = 3;
1442 r600_src->swizzle[2] = 3;
1443 r600_src->swizzle[3] = 3;
1444 }
1445 } else if (ctx->type == PIPE_SHADER_TESS_CTRL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
1446 r600_src->sel = 0;
1447 r600_src->swizzle[0] = 0;
1448 r600_src->swizzle[1] = 0;
1449 r600_src->swizzle[2] = 0;
1450 r600_src->swizzle[3] = 0;
1451 } else if (ctx->type == PIPE_SHADER_TESS_EVAL && ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_PRIMID) {
1452 r600_src->sel = 0;
1453 r600_src->swizzle[0] = 3;
1454 r600_src->swizzle[1] = 3;
1455 r600_src->swizzle[2] = 3;
1456 r600_src->swizzle[3] = 3;
1457 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_GRID_SIZE) {
1458 r600_src->sel = load_block_grid_size(ctx, false);
1459 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_BLOCK_SIZE) {
1460 r600_src->sel = load_block_grid_size(ctx, true);
1461 }
1462 } else {
1463 if (tgsi_src->Register.Indirect)
1464 r600_src->rel = V_SQ_REL_RELATIVE;
1465 r600_src->sel = tgsi_src->Register.Index;
1466 r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
1467 }
1468 if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
1469 if (tgsi_src->Register.Dimension) {
1470 r600_src->kc_bank = tgsi_src->Dimension.Index;
1471 if (tgsi_src->Dimension.Indirect) {
1472 r600_src->kc_rel = 1;
1473 }
1474 }
1475 }
1476 }
1477
1478 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
1479 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
1480 unsigned int dst_reg)
1481 {
1482 struct r600_bytecode_vtx vtx;
1483 unsigned int ar_reg;
1484 int r;
1485
1486 if (offset) {
1487 struct r600_bytecode_alu alu;
1488
1489 memset(&alu, 0, sizeof(alu));
1490
1491 alu.op = ALU_OP2_ADD_INT;
1492 alu.src[0].sel = ctx->bc->ar_reg;
1493 alu.src[0].chan = ar_chan;
1494
1495 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1496 alu.src[1].value = offset;
1497
1498 alu.dst.sel = dst_reg;
1499 alu.dst.chan = ar_chan;
1500 alu.dst.write = 1;
1501 alu.last = 1;
1502
1503 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1504 return r;
1505
1506 ar_reg = dst_reg;
1507 } else {
1508 ar_reg = ctx->bc->ar_reg;
1509 }
1510
1511 memset(&vtx, 0, sizeof(vtx));
1512 vtx.buffer_id = cb_idx;
1513 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1514 vtx.src_gpr = ar_reg;
1515 vtx.src_sel_x = ar_chan;
1516 vtx.mega_fetch_count = 16;
1517 vtx.dst_gpr = dst_reg;
1518 vtx.dst_sel_x = 0; /* SEL_X */
1519 vtx.dst_sel_y = 1; /* SEL_Y */
1520 vtx.dst_sel_z = 2; /* SEL_Z */
1521 vtx.dst_sel_w = 3; /* SEL_W */
1522 vtx.data_format = FMT_32_32_32_32_FLOAT;
1523 vtx.num_format_all = 2; /* NUM_FORMAT_SCALED */
1524 vtx.format_comp_all = 1; /* FORMAT_COMP_SIGNED */
1525 vtx.endian = r600_endian_swap(32);
1526 vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;
1527
1528 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1529 return r;
1530
1531 return 0;
1532 }
1533
1534 static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1535 {
1536 struct r600_bytecode_vtx vtx;
1537 int r;
1538 unsigned index = src->Register.Index;
1539 unsigned vtx_id = src->Dimension.Index;
1540 int offset_reg = ctx->gs_rotated_input[vtx_id / 3];
1541 int offset_chan = vtx_id % 3;
1542 int t2 = 0;
1543
1544 /* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
1545 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
1546
1547 if (offset_reg == ctx->gs_rotated_input[0] && offset_chan == 2)
1548 offset_chan = 3;
1549
1550 if (src->Dimension.Indirect || src->Register.Indirect)
1551 t2 = r600_get_temp(ctx);
1552
1553 if (src->Dimension.Indirect) {
1554 int treg[3];
1555 struct r600_bytecode_alu alu;
1556 int r, i;
1557 unsigned addr_reg;
1558 addr_reg = get_address_file_reg(ctx, src->DimIndirect.Index);
1559 if (src->DimIndirect.Index > 0) {
1560 r = single_alu_op2(ctx, ALU_OP1_MOV,
1561 ctx->bc->ar_reg, 0,
1562 addr_reg, 0,
1563 0, 0);
1564 if (r)
1565 return r;
1566 }
1567 /*
1568 we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
1569 at least this is what fglrx seems to do. */
1570 for (i = 0; i < 3; i++) {
1571 treg[i] = r600_get_temp(ctx);
1572 }
1573 r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
1574
1575 for (i = 0; i < 3; i++) {
1576 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1577 alu.op = ALU_OP1_MOV;
1578 alu.src[0].sel = ctx->gs_rotated_input[0];
1579 alu.src[0].chan = i == 2 ? 3 : i;
1580 alu.dst.sel = treg[i];
1581 alu.dst.chan = 0;
1582 alu.dst.write = 1;
1583 alu.last = 1;
1584 r = r600_bytecode_add_alu(ctx->bc, &alu);
1585 if (r)
1586 return r;
1587 }
1588 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1589 alu.op = ALU_OP1_MOV;
1590 alu.src[0].sel = treg[0];
1591 alu.src[0].rel = 1;
1592 alu.dst.sel = t2;
1593 alu.dst.write = 1;
1594 alu.last = 1;
1595 r = r600_bytecode_add_alu(ctx->bc, &alu);
1596 if (r)
1597 return r;
1598 offset_reg = t2;
1599 offset_chan = 0;
1600 }
1601
1602 if (src->Register.Indirect) {
1603 int addr_reg;
1604 unsigned first = ctx->info.input_array_first[src->Indirect.ArrayID];
1605
1606 addr_reg = get_address_file_reg(ctx, src->Indirect.Index);
1607
1608 /* pull the value from index_reg */
1609 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1610 t2, 1,
1611 addr_reg, 0,
1612 V_SQ_ALU_SRC_LITERAL, first);
1613 if (r)
1614 return r;
1615 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1616 t2, 0,
1617 t2, 1,
1618 V_SQ_ALU_SRC_LITERAL, 4,
1619 offset_reg, offset_chan);
1620 if (r)
1621 return r;
1622 offset_reg = t2;
1623 offset_chan = 0;
1624 index = src->Register.Index - first;
1625 }
1626
1627 memset(&vtx, 0, sizeof(vtx));
1628 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1629 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1630 vtx.src_gpr = offset_reg;
1631 vtx.src_sel_x = offset_chan;
1632 vtx.offset = index * 16; /*bytes*/
1633 vtx.mega_fetch_count = 16;
1634 vtx.dst_gpr = dst_reg;
1635 vtx.dst_sel_x = 0; /* SEL_X */
1636 vtx.dst_sel_y = 1; /* SEL_Y */
1637 vtx.dst_sel_z = 2; /* SEL_Z */
1638 vtx.dst_sel_w = 3; /* SEL_W */
1639 if (ctx->bc->chip_class >= EVERGREEN) {
1640 vtx.use_const_fields = 1;
1641 } else {
1642 vtx.data_format = FMT_32_32_32_32_FLOAT;
1643 }
1644
1645 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1646 return r;
1647
1648 return 0;
1649 }
1650
1651 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1652 {
1653 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1654 unsigned i;
1655
1656 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1657 struct tgsi_full_src_register *src = &inst->Src[i];
1658
1659 if (src->Register.File == TGSI_FILE_INPUT) {
1660 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1661 /* primitive id is in R0.z */
1662 ctx->src[i].sel = 0;
1663 ctx->src[i].swizzle[0] = 2;
1664 }
1665 }
1666 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1667 int treg = r600_get_temp(ctx);
1668
1669 fetch_gs_input(ctx, src, treg);
1670 ctx->src[i].sel = treg;
1671 ctx->src[i].rel = 0;
1672 }
1673 }
1674 return 0;
1675 }
1676
1677
1678 /* Tessellation shaders pass outputs to the next shader using LDS.
1679 *
1680 * LS outputs = TCS(HS) inputs
1681 * TCS(HS) outputs = TES(DS) inputs
1682 *
1683 * The LDS layout is:
1684 * - TCS inputs for patch 0
1685 * - TCS inputs for patch 1
1686 * - TCS inputs for patch 2 = get_tcs_in_current_patch_offset (if RelPatchID==2)
1687 * - ...
1688 * - TCS outputs for patch 0 = get_tcs_out_patch0_offset
1689 * - Per-patch TCS outputs for patch 0 = get_tcs_out_patch0_patch_data_offset
1690 * - TCS outputs for patch 1
1691 * - Per-patch TCS outputs for patch 1
1692 * - TCS outputs for patch 2 = get_tcs_out_current_patch_offset (if RelPatchID==2)
1693 * - Per-patch TCS outputs for patch 2 = get_tcs_out_current_patch_data_offset (if RelPatchID==2)
1694 * - ...
1695 *
1696 * All three shaders VS(LS), TCS, TES share the same LDS space.
1697 */
1698 /* this will return with the dw address in temp_reg.x */
1699 static int r600_get_byte_address(struct r600_shader_ctx *ctx, int temp_reg,
1700 const struct tgsi_full_dst_register *dst,
1701 const struct tgsi_full_src_register *src,
1702 int stride_bytes_reg, int stride_bytes_chan)
1703 {
1704 struct tgsi_full_dst_register reg;
1705 ubyte *name, *index, *array_first;
1706 int r;
1707 int param;
1708 struct tgsi_shader_info *info = &ctx->info;
1709 /* Set the register description. The address computation is the same
1710 * for sources and destinations. */
1711 if (src) {
1712 reg.Register.File = src->Register.File;
1713 reg.Register.Index = src->Register.Index;
1714 reg.Register.Indirect = src->Register.Indirect;
1715 reg.Register.Dimension = src->Register.Dimension;
1716 reg.Indirect = src->Indirect;
1717 reg.Dimension = src->Dimension;
1718 reg.DimIndirect = src->DimIndirect;
1719 } else
1720 reg = *dst;
1721
1722 /* If the register is 2-dimensional (e.g. an array of vertices
1723 * in a primitive), calculate the base address of the vertex. */
1724 if (reg.Register.Dimension) {
1725 int sel, chan;
1726 if (reg.Dimension.Indirect) {
1727 unsigned addr_reg;
1728 assert (reg.DimIndirect.File == TGSI_FILE_ADDRESS);
1729
1730 addr_reg = get_address_file_reg(ctx, reg.DimIndirect.Index);
1731 /* pull the value from index_reg */
1732 sel = addr_reg;
1733 chan = 0;
1734 } else {
1735 sel = V_SQ_ALU_SRC_LITERAL;
1736 chan = reg.Dimension.Index;
1737 }
1738
1739 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1740 temp_reg, 0,
1741 stride_bytes_reg, stride_bytes_chan,
1742 sel, chan,
1743 temp_reg, 0);
1744 if (r)
1745 return r;
1746 }
1747
1748 if (reg.Register.File == TGSI_FILE_INPUT) {
1749 name = info->input_semantic_name;
1750 index = info->input_semantic_index;
1751 array_first = info->input_array_first;
1752 } else if (reg.Register.File == TGSI_FILE_OUTPUT) {
1753 name = info->output_semantic_name;
1754 index = info->output_semantic_index;
1755 array_first = info->output_array_first;
1756 } else {
1757 assert(0);
1758 return -1;
1759 }
1760 if (reg.Register.Indirect) {
1761 int addr_reg;
1762 int first;
1763 /* Add the relative address of the element. */
1764 if (reg.Indirect.ArrayID)
1765 first = array_first[reg.Indirect.ArrayID];
1766 else
1767 first = reg.Register.Index;
1768
1769 addr_reg = get_address_file_reg(ctx, reg.Indirect.Index);
1770
1771 /* pull the value from index_reg */
1772 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
1773 temp_reg, 0,
1774 V_SQ_ALU_SRC_LITERAL, 16,
1775 addr_reg, 0,
1776 temp_reg, 0);
1777 if (r)
1778 return r;
1779
1780 param = r600_get_lds_unique_index(name[first],
1781 index[first]);
1782
1783 } else {
1784 param = r600_get_lds_unique_index(name[reg.Register.Index],
1785 index[reg.Register.Index]);
1786 }
1787
1788 /* add to base_addr - passed in temp_reg.x */
1789 if (param) {
1790 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1791 temp_reg, 0,
1792 temp_reg, 0,
1793 V_SQ_ALU_SRC_LITERAL, param * 16);
1794 if (r)
1795 return r;
1796
1797 }
1798 return 0;
1799 }
1800
1801 static int do_lds_fetch_values(struct r600_shader_ctx *ctx, unsigned temp_reg,
1802 unsigned dst_reg, unsigned mask)
1803 {
1804 struct r600_bytecode_alu alu;
1805 int r, i, lasti;
1806
1807 if ((ctx->bc->cf_last->ndw>>1) >= 0x60)
1808 ctx->bc->force_add_cf = 1;
1809
1810 lasti = tgsi_last_instruction(mask);
1811 for (i = 1; i <= lasti; i++) {
1812 if (!(mask & (1 << i)))
1813 continue;
1814
1815 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
1816 temp_reg, i,
1817 temp_reg, 0,
1818 V_SQ_ALU_SRC_LITERAL, 4 * i);
1819 if (r)
1820 return r;
1821 }
1822 for (i = 0; i <= lasti; i++) {
1823 if (!(mask & (1 << i)))
1824 continue;
1825
1826 /* emit an LDS_READ_RET */
1827 memset(&alu, 0, sizeof(alu));
1828 alu.op = LDS_OP1_LDS_READ_RET;
1829 alu.src[0].sel = temp_reg;
1830 alu.src[0].chan = i;
1831 alu.src[1].sel = V_SQ_ALU_SRC_0;
1832 alu.src[2].sel = V_SQ_ALU_SRC_0;
1833 alu.dst.chan = 0;
1834 alu.is_lds_idx_op = true;
1835 alu.last = 1;
1836 r = r600_bytecode_add_alu(ctx->bc, &alu);
1837 if (r)
1838 return r;
1839 }
1840 for (i = 0; i <= lasti; i++) {
1841 if (!(mask & (1 << i)))
1842 continue;
1843
1844 /* then read from LDS_OQ_A_POP */
1845 memset(&alu, 0, sizeof(alu));
1846
1847 alu.op = ALU_OP1_MOV;
1848 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
1849 alu.src[0].chan = 0;
1850 alu.dst.sel = dst_reg;
1851 alu.dst.chan = i;
1852 alu.dst.write = 1;
1853 alu.last = 1;
1854 r = r600_bytecode_add_alu(ctx->bc, &alu);
1855 if (r)
1856 return r;
1857 }
1858 return 0;
1859 }
1860
1861 static int fetch_mask(struct tgsi_src_register *reg)
1862 {
1863 int mask = 0;
1864 mask |= 1 << reg->SwizzleX;
1865 mask |= 1 << reg->SwizzleY;
1866 mask |= 1 << reg->SwizzleZ;
1867 mask |= 1 << reg->SwizzleW;
1868 return mask;
1869 }
1870
1871 static int fetch_tes_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1872 {
1873 int r;
1874 unsigned temp_reg = r600_get_temp(ctx);
1875
1876 r = get_lds_offset0(ctx, 2, temp_reg,
1877 src->Register.Dimension ? false : true);
1878 if (r)
1879 return r;
1880
1881 /* the base address is now in temp.x */
1882 r = r600_get_byte_address(ctx, temp_reg,
1883 NULL, src, ctx->tess_output_info, 1);
1884 if (r)
1885 return r;
1886
1887 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1888 if (r)
1889 return r;
1890 return 0;
1891 }
1892
1893 static int fetch_tcs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1894 {
1895 int r;
1896 unsigned temp_reg = r600_get_temp(ctx);
1897
1898 /* t.x = ips * r0.y */
1899 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
1900 temp_reg, 0,
1901 ctx->tess_input_info, 0,
1902 0, 1);
1903
1904 if (r)
1905 return r;
1906
1907 /* the base address is now in temp.x */
1908 r = r600_get_byte_address(ctx, temp_reg,
1909 NULL, src, ctx->tess_input_info, 1);
1910 if (r)
1911 return r;
1912
1913 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1914 if (r)
1915 return r;
1916 return 0;
1917 }
1918
1919 static int fetch_tcs_output(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1920 {
1921 int r;
1922 unsigned temp_reg = r600_get_temp(ctx);
1923
1924 r = get_lds_offset0(ctx, 1, temp_reg,
1925 src->Register.Dimension ? false : true);
1926 if (r)
1927 return r;
1928 /* the base address is now in temp.x */
1929 r = r600_get_byte_address(ctx, temp_reg,
1930 NULL, src,
1931 ctx->tess_output_info, 1);
1932 if (r)
1933 return r;
1934
1935 r = do_lds_fetch_values(ctx, temp_reg, dst_reg, fetch_mask(&src->Register));
1936 if (r)
1937 return r;
1938 return 0;
1939 }
1940
1941 static int tgsi_split_lds_inputs(struct r600_shader_ctx *ctx)
1942 {
1943 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1944 unsigned i;
1945
1946 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1947 struct tgsi_full_src_register *src = &inst->Src[i];
1948
1949 if (ctx->type == PIPE_SHADER_TESS_EVAL && src->Register.File == TGSI_FILE_INPUT) {
1950 int treg = r600_get_temp(ctx);
1951 fetch_tes_input(ctx, src, treg);
1952 ctx->src[i].sel = treg;
1953 ctx->src[i].rel = 0;
1954 }
1955 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_INPUT) {
1956 int treg = r600_get_temp(ctx);
1957 fetch_tcs_input(ctx, src, treg);
1958 ctx->src[i].sel = treg;
1959 ctx->src[i].rel = 0;
1960 }
1961 if (ctx->type == PIPE_SHADER_TESS_CTRL && src->Register.File == TGSI_FILE_OUTPUT) {
1962 int treg = r600_get_temp(ctx);
1963 fetch_tcs_output(ctx, src, treg);
1964 ctx->src[i].sel = treg;
1965 ctx->src[i].rel = 0;
1966 }
1967 }
1968 return 0;
1969 }
1970
1971 static int tgsi_split_constant(struct r600_shader_ctx *ctx)
1972 {
1973 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1974 struct r600_bytecode_alu alu;
1975 int i, j, k, nconst, r;
1976
1977 for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
1978 if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
1979 nconst++;
1980 }
1981 tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
1982 }
1983 for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
1984 if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
1985 continue;
1986 }
1987
1988 if (ctx->src[i].rel) {
1989 int chan = inst->Src[i].Indirect.Swizzle;
1990 int treg = r600_get_temp(ctx);
1991 if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
1992 return r;
1993
1994 ctx->src[i].kc_bank = 0;
1995 ctx->src[i].kc_rel = 0;
1996 ctx->src[i].sel = treg;
1997 ctx->src[i].rel = 0;
1998 j--;
1999 } else if (j > 0) {
2000 int treg = r600_get_temp(ctx);
2001 for (k = 0; k < 4; k++) {
2002 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2003 alu.op = ALU_OP1_MOV;
2004 alu.src[0].sel = ctx->src[i].sel;
2005 alu.src[0].chan = k;
2006 alu.src[0].rel = ctx->src[i].rel;
2007 alu.src[0].kc_bank = ctx->src[i].kc_bank;
2008 alu.src[0].kc_rel = ctx->src[i].kc_rel;
2009 alu.dst.sel = treg;
2010 alu.dst.chan = k;
2011 alu.dst.write = 1;
2012 if (k == 3)
2013 alu.last = 1;
2014 r = r600_bytecode_add_alu(ctx->bc, &alu);
2015 if (r)
2016 return r;
2017 }
2018 ctx->src[i].sel = treg;
2019 ctx->src[i].rel =0;
2020 j--;
2021 }
2022 }
2023 return 0;
2024 }
2025
2026 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
2027 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
2028 {
2029 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2030 struct r600_bytecode_alu alu;
2031 int i, j, k, nliteral, r;
2032
2033 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
2034 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2035 nliteral++;
2036 }
2037 }
2038 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
2039 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
2040 int treg = r600_get_temp(ctx);
2041 for (k = 0; k < 4; k++) {
2042 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2043 alu.op = ALU_OP1_MOV;
2044 alu.src[0].sel = ctx->src[i].sel;
2045 alu.src[0].chan = k;
2046 alu.src[0].value = ctx->src[i].value[k];
2047 alu.dst.sel = treg;
2048 alu.dst.chan = k;
2049 alu.dst.write = 1;
2050 if (k == 3)
2051 alu.last = 1;
2052 r = r600_bytecode_add_alu(ctx->bc, &alu);
2053 if (r)
2054 return r;
2055 }
2056 ctx->src[i].sel = treg;
2057 j--;
2058 }
2059 }
2060 return 0;
2061 }
2062
2063 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
2064 {
2065 int i, r, count = ctx->shader->ninput;
2066
2067 for (i = 0; i < count; i++) {
2068 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2069 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
2070 if (r)
2071 return r;
2072 }
2073 }
2074 return 0;
2075 }
2076
2077 static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
2078 int stream, unsigned *stream_item_size UNUSED)
2079 {
2080 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
2081 unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
2082 int j, r;
2083 unsigned i;
2084
2085 /* Sanity checking. */
2086 if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
2087 R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
2088 r = -EINVAL;
2089 goto out_err;
2090 }
2091 for (i = 0; i < so->num_outputs; i++) {
2092 if (so->output[i].output_buffer >= 4) {
2093 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
2094 so->output[i].output_buffer);
2095 r = -EINVAL;
2096 goto out_err;
2097 }
2098 }
2099
2100 /* Initialize locations where the outputs are stored. */
2101 for (i = 0; i < so->num_outputs; i++) {
2102
2103 so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
2104 start_comp[i] = so->output[i].start_component;
2105 /* Lower outputs with dst_offset < start_component.
2106 *
2107 * We can only output 4D vectors with a write mask, e.g. we can
2108 * only output the W component at offset 3, etc. If we want
2109 * to store Y, Z, or W at buffer offset 0, we need to use MOV
2110 * to move it to X and output X. */
2111 if (so->output[i].dst_offset < so->output[i].start_component) {
2112 unsigned tmp = r600_get_temp(ctx);
2113
2114 for (j = 0; j < so->output[i].num_components; j++) {
2115 struct r600_bytecode_alu alu;
2116 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2117 alu.op = ALU_OP1_MOV;
2118 alu.src[0].sel = so_gpr[i];
2119 alu.src[0].chan = so->output[i].start_component + j;
2120
2121 alu.dst.sel = tmp;
2122 alu.dst.chan = j;
2123 alu.dst.write = 1;
2124 if (j == so->output[i].num_components - 1)
2125 alu.last = 1;
2126 r = r600_bytecode_add_alu(ctx->bc, &alu);
2127 if (r)
2128 return r;
2129 }
2130 start_comp[i] = 0;
2131 so_gpr[i] = tmp;
2132 }
2133 }
2134
2135 /* Write outputs to buffers. */
2136 for (i = 0; i < so->num_outputs; i++) {
2137 struct r600_bytecode_output output;
2138
2139 if (stream != -1 && stream != so->output[i].output_buffer)
2140 continue;
2141
2142 memset(&output, 0, sizeof(struct r600_bytecode_output));
2143 output.gpr = so_gpr[i];
2144 output.elem_size = so->output[i].num_components - 1;
2145 if (output.elem_size == 2)
2146 output.elem_size = 3; // 3 not supported, write 4 with junk at end
2147 output.array_base = so->output[i].dst_offset - start_comp[i];
2148 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2149 output.burst_count = 1;
2150 /* array_size is an upper limit for the burst_count
2151 * with MEM_STREAM instructions */
2152 output.array_size = 0xFFF;
2153 output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
2154
2155 if (ctx->bc->chip_class >= EVERGREEN) {
2156 switch (so->output[i].output_buffer) {
2157 case 0:
2158 output.op = CF_OP_MEM_STREAM0_BUF0;
2159 break;
2160 case 1:
2161 output.op = CF_OP_MEM_STREAM0_BUF1;
2162 break;
2163 case 2:
2164 output.op = CF_OP_MEM_STREAM0_BUF2;
2165 break;
2166 case 3:
2167 output.op = CF_OP_MEM_STREAM0_BUF3;
2168 break;
2169 }
2170 output.op += so->output[i].stream * 4;
2171 assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
2172 ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
2173 } else {
2174 switch (so->output[i].output_buffer) {
2175 case 0:
2176 output.op = CF_OP_MEM_STREAM0;
2177 break;
2178 case 1:
2179 output.op = CF_OP_MEM_STREAM1;
2180 break;
2181 case 2:
2182 output.op = CF_OP_MEM_STREAM2;
2183 break;
2184 case 3:
2185 output.op = CF_OP_MEM_STREAM3;
2186 break;
2187 }
2188 ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
2189 }
2190 r = r600_bytecode_add_output(ctx->bc, &output);
2191 if (r)
2192 goto out_err;
2193 }
2194 return 0;
2195 out_err:
2196 return r;
2197 }
2198
2199 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
2200 {
2201 struct r600_bytecode_alu alu;
2202 unsigned reg;
2203
2204 if (!ctx->shader->vs_out_edgeflag)
2205 return;
2206
2207 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
2208
2209 /* clamp(x, 0, 1) */
2210 memset(&alu, 0, sizeof(alu));
2211 alu.op = ALU_OP1_MOV;
2212 alu.src[0].sel = reg;
2213 alu.dst.sel = reg;
2214 alu.dst.write = 1;
2215 alu.dst.clamp = 1;
2216 alu.last = 1;
2217 r600_bytecode_add_alu(ctx->bc, &alu);
2218
2219 memset(&alu, 0, sizeof(alu));
2220 alu.op = ALU_OP1_FLT_TO_INT;
2221 alu.src[0].sel = reg;
2222 alu.dst.sel = reg;
2223 alu.dst.write = 1;
2224 alu.last = 1;
2225 r600_bytecode_add_alu(ctx->bc, &alu);
2226 }
2227
2228 static int generate_gs_copy_shader(struct r600_context *rctx,
2229 struct r600_pipe_shader *gs,
2230 struct pipe_stream_output_info *so)
2231 {
2232 struct r600_shader_ctx ctx = {};
2233 struct r600_shader *gs_shader = &gs->shader;
2234 struct r600_pipe_shader *cshader;
2235 unsigned ocnt = gs_shader->noutput;
2236 struct r600_bytecode_alu alu;
2237 struct r600_bytecode_vtx vtx;
2238 struct r600_bytecode_output output;
2239 struct r600_bytecode_cf *cf_jump, *cf_pop,
2240 *last_exp_pos = NULL, *last_exp_param = NULL;
2241 int next_clip_pos = 61, next_param = 0;
2242 unsigned i, j;
2243 int ring;
2244 bool only_ring_0 = true;
2245 cshader = calloc(1, sizeof(struct r600_pipe_shader));
2246 if (!cshader)
2247 return 0;
2248
2249 memcpy(cshader->shader.output, gs_shader->output, ocnt *
2250 sizeof(struct r600_shader_io));
2251
2252 cshader->shader.noutput = ocnt;
2253
2254 ctx.shader = &cshader->shader;
2255 ctx.bc = &ctx.shader->bc;
2256 ctx.type = ctx.bc->type = PIPE_SHADER_VERTEX;
2257
2258 r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
2259 rctx->screen->has_compressed_msaa_texturing);
2260
2261 ctx.bc->isa = rctx->isa;
2262
2263 cf_jump = NULL;
2264 memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));
2265
2266 /* R0.x = R0.x & 0x3fffffff */
2267 memset(&alu, 0, sizeof(alu));
2268 alu.op = ALU_OP2_AND_INT;
2269 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2270 alu.src[1].value = 0x3fffffff;
2271 alu.dst.write = 1;
2272 r600_bytecode_add_alu(ctx.bc, &alu);
2273
2274 /* R0.y = R0.x >> 30 */
2275 memset(&alu, 0, sizeof(alu));
2276 alu.op = ALU_OP2_LSHR_INT;
2277 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2278 alu.src[1].value = 0x1e;
2279 alu.dst.chan = 1;
2280 alu.dst.write = 1;
2281 alu.last = 1;
2282 r600_bytecode_add_alu(ctx.bc, &alu);
2283
2284 /* fetch vertex data from GSVS ring */
2285 for (i = 0; i < ocnt; ++i) {
2286 struct r600_shader_io *out = &ctx.shader->output[i];
2287
2288 out->gpr = i + 1;
2289 out->ring_offset = i * 16;
2290
2291 memset(&vtx, 0, sizeof(vtx));
2292 vtx.op = FETCH_OP_VFETCH;
2293 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
2294 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2295 vtx.mega_fetch_count = 16;
2296 vtx.offset = out->ring_offset;
2297 vtx.dst_gpr = out->gpr;
2298 vtx.src_gpr = 0;
2299 vtx.dst_sel_x = 0;
2300 vtx.dst_sel_y = 1;
2301 vtx.dst_sel_z = 2;
2302 vtx.dst_sel_w = 3;
2303 if (rctx->b.chip_class >= EVERGREEN) {
2304 vtx.use_const_fields = 1;
2305 } else {
2306 vtx.data_format = FMT_32_32_32_32_FLOAT;
2307 }
2308
2309 r600_bytecode_add_vtx(ctx.bc, &vtx);
2310 }
2311 ctx.temp_reg = i + 1;
2312 for (ring = 3; ring >= 0; --ring) {
2313 bool enabled = false;
2314 for (i = 0; i < so->num_outputs; i++) {
2315 if (so->output[i].stream == ring) {
2316 enabled = true;
2317 if (ring > 0)
2318 only_ring_0 = false;
2319 break;
2320 }
2321 }
2322 if (ring != 0 && !enabled) {
2323 cshader->shader.ring_item_sizes[ring] = 0;
2324 continue;
2325 }
2326
2327 if (cf_jump) {
2328 // Patch up jump label
2329 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2330 cf_pop = ctx.bc->cf_last;
2331
2332 cf_jump->cf_addr = cf_pop->id + 2;
2333 cf_jump->pop_count = 1;
2334 cf_pop->cf_addr = cf_pop->id + 2;
2335 cf_pop->pop_count = 1;
2336 }
2337
2338 /* PRED_SETE_INT __, R0.y, ring */
2339 memset(&alu, 0, sizeof(alu));
2340 alu.op = ALU_OP2_PRED_SETE_INT;
2341 alu.src[0].chan = 1;
2342 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2343 alu.src[1].value = ring;
2344 alu.execute_mask = 1;
2345 alu.update_pred = 1;
2346 alu.last = 1;
2347 r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
2348
2349 r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
2350 cf_jump = ctx.bc->cf_last;
2351
2352 if (enabled)
2353 emit_streamout(&ctx, so, only_ring_0 ? -1 : ring, &cshader->shader.ring_item_sizes[ring]);
2354 cshader->shader.ring_item_sizes[ring] = ocnt * 16;
2355 }
2356
2357 /* bc adds nops - copy it */
2358 if (ctx.bc->chip_class == R600) {
2359 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2360 alu.op = ALU_OP0_NOP;
2361 alu.last = 1;
2362 r600_bytecode_add_alu(ctx.bc, &alu);
2363
2364 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2365 }
2366
2367 /* export vertex data */
2368 /* XXX factor out common code with r600_shader_from_tgsi ? */
2369 for (i = 0; i < ocnt; ++i) {
2370 struct r600_shader_io *out = &ctx.shader->output[i];
2371 bool instream0 = true;
2372 if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
2373 continue;
2374
2375 for (j = 0; j < so->num_outputs; j++) {
2376 if (so->output[j].register_index == i) {
2377 if (so->output[j].stream == 0)
2378 break;
2379 if (so->output[j].stream > 0)
2380 instream0 = false;
2381 }
2382 }
2383 if (!instream0)
2384 continue;
2385 memset(&output, 0, sizeof(output));
2386 output.gpr = out->gpr;
2387 output.elem_size = 3;
2388 output.swizzle_x = 0;
2389 output.swizzle_y = 1;
2390 output.swizzle_z = 2;
2391 output.swizzle_w = 3;
2392 output.burst_count = 1;
2393 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2394 output.op = CF_OP_EXPORT;
2395 switch (out->name) {
2396 case TGSI_SEMANTIC_POSITION:
2397 output.array_base = 60;
2398 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2399 break;
2400
2401 case TGSI_SEMANTIC_PSIZE:
2402 output.array_base = 61;
2403 if (next_clip_pos == 61)
2404 next_clip_pos = 62;
2405 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2406 output.swizzle_y = 7;
2407 output.swizzle_z = 7;
2408 output.swizzle_w = 7;
2409 ctx.shader->vs_out_misc_write = 1;
2410 ctx.shader->vs_out_point_size = 1;
2411 break;
2412 case TGSI_SEMANTIC_LAYER:
2413 if (out->spi_sid) {
2414 /* duplicate it as PARAM to pass to the pixel shader */
2415 output.array_base = next_param++;
2416 r600_bytecode_add_output(ctx.bc, &output);
2417 last_exp_param = ctx.bc->cf_last;
2418 }
2419 output.array_base = 61;
2420 if (next_clip_pos == 61)
2421 next_clip_pos = 62;
2422 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2423 output.swizzle_x = 7;
2424 output.swizzle_y = 7;
2425 output.swizzle_z = 0;
2426 output.swizzle_w = 7;
2427 ctx.shader->vs_out_misc_write = 1;
2428 ctx.shader->vs_out_layer = 1;
2429 break;
2430 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2431 if (out->spi_sid) {
2432 /* duplicate it as PARAM to pass to the pixel shader */
2433 output.array_base = next_param++;
2434 r600_bytecode_add_output(ctx.bc, &output);
2435 last_exp_param = ctx.bc->cf_last;
2436 }
2437 output.array_base = 61;
2438 if (next_clip_pos == 61)
2439 next_clip_pos = 62;
2440 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2441 ctx.shader->vs_out_misc_write = 1;
2442 ctx.shader->vs_out_viewport = 1;
2443 output.swizzle_x = 7;
2444 output.swizzle_y = 7;
2445 output.swizzle_z = 7;
2446 output.swizzle_w = 0;
2447 break;
2448 case TGSI_SEMANTIC_CLIPDIST:
2449 /* spi_sid is 0 for clipdistance outputs that were generated
2450 * for clipvertex - we don't need to pass them to PS */
2451 ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
2452 ctx.shader->cull_dist_write = gs->shader.cull_dist_write;
2453 ctx.shader->cc_dist_mask = gs->shader.cc_dist_mask;
2454 if (out->spi_sid) {
2455 /* duplicate it as PARAM to pass to the pixel shader */
2456 output.array_base = next_param++;
2457 r600_bytecode_add_output(ctx.bc, &output);
2458 last_exp_param = ctx.bc->cf_last;
2459 }
2460 output.array_base = next_clip_pos++;
2461 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2462 break;
2463 case TGSI_SEMANTIC_FOG:
2464 output.swizzle_y = 4; /* 0 */
2465 output.swizzle_z = 4; /* 0 */
2466 output.swizzle_w = 5; /* 1 */
2467 break;
2468 default:
2469 output.array_base = next_param++;
2470 break;
2471 }
2472 r600_bytecode_add_output(ctx.bc, &output);
2473 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
2474 last_exp_param = ctx.bc->cf_last;
2475 else
2476 last_exp_pos = ctx.bc->cf_last;
2477 }
2478
2479 if (!last_exp_pos) {
2480 memset(&output, 0, sizeof(output));
2481 output.gpr = 0;
2482 output.elem_size = 3;
2483 output.swizzle_x = 7;
2484 output.swizzle_y = 7;
2485 output.swizzle_z = 7;
2486 output.swizzle_w = 7;
2487 output.burst_count = 1;
2488 output.type = 2;
2489 output.op = CF_OP_EXPORT;
2490 output.array_base = 60;
2491 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2492 r600_bytecode_add_output(ctx.bc, &output);
2493 last_exp_pos = ctx.bc->cf_last;
2494 }
2495
2496 if (!last_exp_param) {
2497 memset(&output, 0, sizeof(output));
2498 output.gpr = 0;
2499 output.elem_size = 3;
2500 output.swizzle_x = 7;
2501 output.swizzle_y = 7;
2502 output.swizzle_z = 7;
2503 output.swizzle_w = 7;
2504 output.burst_count = 1;
2505 output.type = 2;
2506 output.op = CF_OP_EXPORT;
2507 output.array_base = next_param++;
2508 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2509 r600_bytecode_add_output(ctx.bc, &output);
2510 last_exp_param = ctx.bc->cf_last;
2511 }
2512
2513 last_exp_pos->op = CF_OP_EXPORT_DONE;
2514 last_exp_param->op = CF_OP_EXPORT_DONE;
2515
2516 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
2517 cf_pop = ctx.bc->cf_last;
2518
2519 cf_jump->cf_addr = cf_pop->id + 2;
2520 cf_jump->pop_count = 1;
2521 cf_pop->cf_addr = cf_pop->id + 2;
2522 cf_pop->pop_count = 1;
2523
2524 if (ctx.bc->chip_class == CAYMAN)
2525 cm_bytecode_add_cf_end(ctx.bc);
2526 else {
2527 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2528 ctx.bc->cf_last->end_of_program = 1;
2529 }
2530
2531 gs->gs_copy_shader = cshader;
2532 cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
2533
2534 ctx.bc->nstack = 1;
2535
2536 return r600_bytecode_build(ctx.bc);
2537 }
2538
2539 static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
2540 {
2541 if (ind) {
2542 struct r600_bytecode_alu alu;
2543 int r;
2544
2545 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2546 alu.op = ALU_OP2_ADD_INT;
2547 alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
2548 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2549 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
2550 alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
2551 alu.dst.write = 1;
2552 alu.last = 1;
2553 r = r600_bytecode_add_alu(ctx->bc, &alu);
2554 if (r)
2555 return r;
2556 }
2557 return 0;
2558 }
2559
2560 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so UNUSED, int stream, bool ind)
2561 {
2562 struct r600_bytecode_output output;
2563 int ring_offset;
2564 unsigned i, k;
2565 int effective_stream = stream == -1 ? 0 : stream;
2566 int idx = 0;
2567
2568 for (i = 0; i < ctx->shader->noutput; i++) {
2569 if (ctx->gs_for_vs) {
2570 /* for ES we need to lookup corresponding ring offset expected by GS
2571 * (map this output to GS input by name and sid) */
2572 /* FIXME precompute offsets */
2573 ring_offset = -1;
2574 for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
2575 struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
2576 struct r600_shader_io *out = &ctx->shader->output[i];
2577 if (in->name == out->name && in->sid == out->sid)
2578 ring_offset = in->ring_offset;
2579 }
2580
2581 if (ring_offset == -1)
2582 continue;
2583 } else {
2584 ring_offset = idx * 16;
2585 idx++;
2586 }
2587
2588 if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
2589 continue;
2590 /* next_ring_offset after parsing input decls contains total size of
2591 * single vertex data, gs_next_vertex - current vertex index */
2592 if (!ind)
2593 ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
2594
2595 memset(&output, 0, sizeof(struct r600_bytecode_output));
2596 output.gpr = ctx->shader->output[i].gpr;
2597 output.elem_size = 3;
2598 output.comp_mask = 0xF;
2599 output.burst_count = 1;
2600
2601 if (ind)
2602 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
2603 else
2604 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
2605
2606 switch (stream) {
2607 default:
2608 case 0:
2609 output.op = CF_OP_MEM_RING; break;
2610 case 1:
2611 output.op = CF_OP_MEM_RING1; break;
2612 case 2:
2613 output.op = CF_OP_MEM_RING2; break;
2614 case 3:
2615 output.op = CF_OP_MEM_RING3; break;
2616 }
2617
2618 if (ind) {
2619 output.array_base = ring_offset >> 2; /* in dwords */
2620 output.array_size = 0xfff;
2621 output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
2622 } else
2623 output.array_base = ring_offset >> 2; /* in dwords */
2624 r600_bytecode_add_output(ctx->bc, &output);
2625 }
2626
2627 ++ctx->gs_next_vertex;
2628 return 0;
2629 }
2630
2631
2632 static int r600_fetch_tess_io_info(struct r600_shader_ctx *ctx)
2633 {
2634 int r;
2635 struct r600_bytecode_vtx vtx;
2636 int temp_val = ctx->temp_reg;
2637 /* need to store the TCS output somewhere */
2638 r = single_alu_op2(ctx, ALU_OP1_MOV,
2639 temp_val, 0,
2640 V_SQ_ALU_SRC_LITERAL, 0,
2641 0, 0);
2642 if (r)
2643 return r;
2644
2645 /* used by VS/TCS */
2646 if (ctx->tess_input_info) {
2647 /* fetch tcs input values into resv space */
2648 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2649 vtx.op = FETCH_OP_VFETCH;
2650 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2651 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2652 vtx.mega_fetch_count = 16;
2653 vtx.data_format = FMT_32_32_32_32;
2654 vtx.num_format_all = 2;
2655 vtx.format_comp_all = 1;
2656 vtx.use_const_fields = 0;
2657 vtx.endian = r600_endian_swap(32);
2658 vtx.srf_mode_all = 1;
2659 vtx.offset = 0;
2660 vtx.dst_gpr = ctx->tess_input_info;
2661 vtx.dst_sel_x = 0;
2662 vtx.dst_sel_y = 1;
2663 vtx.dst_sel_z = 2;
2664 vtx.dst_sel_w = 3;
2665 vtx.src_gpr = temp_val;
2666 vtx.src_sel_x = 0;
2667
2668 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2669 if (r)
2670 return r;
2671 }
2672
2673 /* used by TCS/TES */
2674 if (ctx->tess_output_info) {
2675 /* fetch tcs output values into resv space */
2676 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
2677 vtx.op = FETCH_OP_VFETCH;
2678 vtx.buffer_id = R600_LDS_INFO_CONST_BUFFER;
2679 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
2680 vtx.mega_fetch_count = 16;
2681 vtx.data_format = FMT_32_32_32_32;
2682 vtx.num_format_all = 2;
2683 vtx.format_comp_all = 1;
2684 vtx.use_const_fields = 0;
2685 vtx.endian = r600_endian_swap(32);
2686 vtx.srf_mode_all = 1;
2687 vtx.offset = 16;
2688 vtx.dst_gpr = ctx->tess_output_info;
2689 vtx.dst_sel_x = 0;
2690 vtx.dst_sel_y = 1;
2691 vtx.dst_sel_z = 2;
2692 vtx.dst_sel_w = 3;
2693 vtx.src_gpr = temp_val;
2694 vtx.src_sel_x = 0;
2695
2696 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
2697 if (r)
2698 return r;
2699 }
2700 return 0;
2701 }
2702
2703 static int emit_lds_vs_writes(struct r600_shader_ctx *ctx)
2704 {
2705 int j, r;
2706 int temp_reg;
2707 unsigned i;
2708
2709 /* fetch tcs input values into input_vals */
2710 ctx->tess_input_info = r600_get_temp(ctx);
2711 ctx->tess_output_info = 0;
2712 r = r600_fetch_tess_io_info(ctx);
2713 if (r)
2714 return r;
2715
2716 temp_reg = r600_get_temp(ctx);
2717 /* dst reg contains LDS address stride * idx */
2718 /* MUL vertexID, vertex_dw_stride */
2719 r = single_alu_op2(ctx, ALU_OP2_MUL_UINT24,
2720 temp_reg, 0,
2721 ctx->tess_input_info, 1,
2722 0, 1); /* rel id in r0.y? */
2723 if (r)
2724 return r;
2725
2726 for (i = 0; i < ctx->shader->noutput; i++) {
2727 struct r600_bytecode_alu alu;
2728 int param = r600_get_lds_unique_index(ctx->shader->output[i].name, ctx->shader->output[i].sid);
2729
2730 if (param) {
2731 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2732 temp_reg, 1,
2733 temp_reg, 0,
2734 V_SQ_ALU_SRC_LITERAL, param * 16);
2735 if (r)
2736 return r;
2737 }
2738
2739 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2740 temp_reg, 2,
2741 temp_reg, param ? 1 : 0,
2742 V_SQ_ALU_SRC_LITERAL, 8);
2743 if (r)
2744 return r;
2745
2746
2747 for (j = 0; j < 2; j++) {
2748 int chan = (j == 1) ? 2 : (param ? 1 : 0);
2749 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2750 alu.op = LDS_OP3_LDS_WRITE_REL;
2751 alu.src[0].sel = temp_reg;
2752 alu.src[0].chan = chan;
2753 alu.src[1].sel = ctx->shader->output[i].gpr;
2754 alu.src[1].chan = j * 2;
2755 alu.src[2].sel = ctx->shader->output[i].gpr;
2756 alu.src[2].chan = (j * 2) + 1;
2757 alu.last = 1;
2758 alu.dst.chan = 0;
2759 alu.lds_idx = 1;
2760 alu.is_lds_idx_op = true;
2761 r = r600_bytecode_add_alu(ctx->bc, &alu);
2762 if (r)
2763 return r;
2764 }
2765 }
2766 return 0;
2767 }
2768
2769 static int r600_store_tcs_output(struct r600_shader_ctx *ctx)
2770 {
2771 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2772 const struct tgsi_full_dst_register *dst = &inst->Dst[0];
2773 int i, r, lasti;
2774 int temp_reg = r600_get_temp(ctx);
2775 struct r600_bytecode_alu alu;
2776 unsigned write_mask = dst->Register.WriteMask;
2777
2778 if (inst->Dst[0].Register.File != TGSI_FILE_OUTPUT)
2779 return 0;
2780
2781 r = get_lds_offset0(ctx, 1, temp_reg, dst->Register.Dimension ? false : true);
2782 if (r)
2783 return r;
2784
2785 /* the base address is now in temp.x */
2786 r = r600_get_byte_address(ctx, temp_reg,
2787 &inst->Dst[0], NULL, ctx->tess_output_info, 1);
2788 if (r)
2789 return r;
2790
2791 /* LDS write */
2792 lasti = tgsi_last_instruction(write_mask);
2793 for (i = 1; i <= lasti; i++) {
2794
2795 if (!(write_mask & (1 << i)))
2796 continue;
2797 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2798 temp_reg, i,
2799 temp_reg, 0,
2800 V_SQ_ALU_SRC_LITERAL, 4 * i);
2801 if (r)
2802 return r;
2803 }
2804
2805 for (i = 0; i <= lasti; i++) {
2806 if (!(write_mask & (1 << i)))
2807 continue;
2808
2809 if ((i == 0 && ((write_mask & 3) == 3)) ||
2810 (i == 2 && ((write_mask & 0xc) == 0xc))) {
2811 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2812 alu.op = LDS_OP3_LDS_WRITE_REL;
2813 alu.src[0].sel = temp_reg;
2814 alu.src[0].chan = i;
2815
2816 alu.src[1].sel = dst->Register.Index;
2817 alu.src[1].sel += ctx->file_offset[dst->Register.File];
2818 alu.src[1].chan = i;
2819
2820 alu.src[2].sel = dst->Register.Index;
2821 alu.src[2].sel += ctx->file_offset[dst->Register.File];
2822 alu.src[2].chan = i + 1;
2823 alu.lds_idx = 1;
2824 alu.dst.chan = 0;
2825 alu.last = 1;
2826 alu.is_lds_idx_op = true;
2827 r = r600_bytecode_add_alu(ctx->bc, &alu);
2828 if (r)
2829 return r;
2830 i += 1;
2831 continue;
2832 }
2833 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2834 alu.op = LDS_OP2_LDS_WRITE;
2835 alu.src[0].sel = temp_reg;
2836 alu.src[0].chan = i;
2837
2838 alu.src[1].sel = dst->Register.Index;
2839 alu.src[1].sel += ctx->file_offset[dst->Register.File];
2840 alu.src[1].chan = i;
2841
2842 alu.src[2].sel = V_SQ_ALU_SRC_0;
2843 alu.dst.chan = 0;
2844 alu.last = 1;
2845 alu.is_lds_idx_op = true;
2846 r = r600_bytecode_add_alu(ctx->bc, &alu);
2847 if (r)
2848 return r;
2849 }
2850 return 0;
2851 }
2852
2853 static int r600_tess_factor_read(struct r600_shader_ctx *ctx,
2854 int output_idx, int nc)
2855 {
2856 int param;
2857 unsigned temp_reg = r600_get_temp(ctx);
2858 unsigned name = ctx->shader->output[output_idx].name;
2859 int dreg = ctx->shader->output[output_idx].gpr;
2860 int r;
2861
2862 param = r600_get_lds_unique_index(name, 0);
2863 r = get_lds_offset0(ctx, 1, temp_reg, true);
2864 if (r)
2865 return r;
2866
2867 if (param) {
2868 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2869 temp_reg, 0,
2870 temp_reg, 0,
2871 V_SQ_ALU_SRC_LITERAL, param * 16);
2872 if (r)
2873 return r;
2874 }
2875
2876 do_lds_fetch_values(ctx, temp_reg, dreg, ((1u << nc) - 1));
2877 return 0;
2878 }
2879
2880 static int r600_emit_tess_factor(struct r600_shader_ctx *ctx)
2881 {
2882 int stride, outer_comps, inner_comps;
2883 int tessinner_idx = -1, tessouter_idx = -1;
2884 int i, r;
2885 unsigned j;
2886 int temp_reg = r600_get_temp(ctx);
2887 int treg[3] = {-1, -1, -1};
2888 struct r600_bytecode_alu alu;
2889 struct r600_bytecode_cf *cf_jump, *cf_pop;
2890
2891 /* only execute factor emission for invocation 0 */
2892 /* PRED_SETE_INT __, R0.x, 0 */
2893 memset(&alu, 0, sizeof(alu));
2894 alu.op = ALU_OP2_PRED_SETE_INT;
2895 alu.src[0].chan = 2;
2896 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2897 alu.execute_mask = 1;
2898 alu.update_pred = 1;
2899 alu.last = 1;
2900 r600_bytecode_add_alu_type(ctx->bc, &alu, CF_OP_ALU_PUSH_BEFORE);
2901
2902 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
2903 cf_jump = ctx->bc->cf_last;
2904
2905 treg[0] = r600_get_temp(ctx);
2906 switch (ctx->shader->tcs_prim_mode) {
2907 case PIPE_PRIM_LINES:
2908 stride = 8; /* 2 dwords, 1 vec2 store */
2909 outer_comps = 2;
2910 inner_comps = 0;
2911 break;
2912 case PIPE_PRIM_TRIANGLES:
2913 stride = 16; /* 4 dwords, 1 vec4 store */
2914 outer_comps = 3;
2915 inner_comps = 1;
2916 treg[1] = r600_get_temp(ctx);
2917 break;
2918 case PIPE_PRIM_QUADS:
2919 stride = 24; /* 6 dwords, 2 stores (vec4 + vec2) */
2920 outer_comps = 4;
2921 inner_comps = 2;
2922 treg[1] = r600_get_temp(ctx);
2923 treg[2] = r600_get_temp(ctx);
2924 break;
2925 default:
2926 assert(0);
2927 return -1;
2928 }
2929
2930 /* R0 is InvocationID, RelPatchID, PatchID, tf_base */
2931 /* TF_WRITE takes index in R.x, value in R.y */
2932 for (j = 0; j < ctx->shader->noutput; j++) {
2933 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSINNER)
2934 tessinner_idx = j;
2935 if (ctx->shader->output[j].name == TGSI_SEMANTIC_TESSOUTER)
2936 tessouter_idx = j;
2937 }
2938
2939 if (tessouter_idx == -1)
2940 return -1;
2941
2942 if (tessinner_idx == -1 && inner_comps)
2943 return -1;
2944
2945 if (tessouter_idx != -1) {
2946 r = r600_tess_factor_read(ctx, tessouter_idx, outer_comps);
2947 if (r)
2948 return r;
2949 }
2950
2951 if (tessinner_idx != -1) {
2952 r = r600_tess_factor_read(ctx, tessinner_idx, inner_comps);
2953 if (r)
2954 return r;
2955 }
2956
2957 /* r.x = tf_base(r0.w) + relpatchid(r0.y) * tf_stride */
2958 /* r.x = relpatchid(r0.y) * tf_stride */
2959
2960 /* multiply incoming r0.y * stride - t.x = r0.y * stride */
2961 /* add incoming r0.w to it: t.x = t.x + r0.w */
2962 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
2963 temp_reg, 0,
2964 0, 1,
2965 V_SQ_ALU_SRC_LITERAL, stride,
2966 0, 3);
2967 if (r)
2968 return r;
2969
2970 for (i = 0; i < outer_comps + inner_comps; i++) {
2971 int out_idx = i >= outer_comps ? tessinner_idx : tessouter_idx;
2972 int out_comp = i >= outer_comps ? i - outer_comps : i;
2973
2974 if (ctx->shader->tcs_prim_mode == PIPE_PRIM_LINES) {
2975 if (out_comp == 1)
2976 out_comp = 0;
2977 else if (out_comp == 0)
2978 out_comp = 1;
2979 }
2980
2981 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
2982 treg[i / 2], (2 * (i % 2)),
2983 temp_reg, 0,
2984 V_SQ_ALU_SRC_LITERAL, 4 * i);
2985 if (r)
2986 return r;
2987 r = single_alu_op2(ctx, ALU_OP1_MOV,
2988 treg[i / 2], 1 + (2 * (i%2)),
2989 ctx->shader->output[out_idx].gpr, out_comp,
2990 0, 0);
2991 if (r)
2992 return r;
2993 }
2994 for (i = 0; i < outer_comps + inner_comps; i++) {
2995 struct r600_bytecode_gds gds;
2996
2997 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
2998 gds.src_gpr = treg[i / 2];
2999 gds.src_sel_x = 2 * (i % 2);
3000 gds.src_sel_y = 1 + (2 * (i % 2));
3001 gds.src_sel_z = 4;
3002 gds.dst_sel_x = 7;
3003 gds.dst_sel_y = 7;
3004 gds.dst_sel_z = 7;
3005 gds.dst_sel_w = 7;
3006 gds.op = FETCH_OP_TF_WRITE;
3007 r = r600_bytecode_add_gds(ctx->bc, &gds);
3008 if (r)
3009 return r;
3010 }
3011
3012 // Patch up jump label
3013 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
3014 cf_pop = ctx->bc->cf_last;
3015
3016 cf_jump->cf_addr = cf_pop->id + 2;
3017 cf_jump->pop_count = 1;
3018 cf_pop->cf_addr = cf_pop->id + 2;
3019 cf_pop->pop_count = 1;
3020
3021 return 0;
3022 }
3023
3024 /*
3025 * We have to work out the thread ID for load and atomic
3026 * operations, which store the returned value to an index
3027 * in an intermediate buffer.
3028 * The index is calculated by taking the thread id,
3029 * calculated from the MBCNT instructions.
3030 * Then the shader engine ID is multiplied by 256,
3031 * and the wave id is added.
3032 * Then the result is multipled by 64 and thread id is
3033 * added.
3034 */
3035 static int load_thread_id_gpr(struct r600_shader_ctx *ctx)
3036 {
3037 struct r600_bytecode_alu alu;
3038 int r;
3039
3040 if (ctx->thread_id_gpr_loaded)
3041 return 0;
3042
3043 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3044 alu.op = ALU_OP1_MBCNT_32LO_ACCUM_PREV_INT;
3045 alu.dst.sel = ctx->temp_reg;
3046 alu.dst.chan = 0;
3047 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3048 alu.src[0].value = 0xffffffff;
3049 alu.dst.write = 1;
3050 r = r600_bytecode_add_alu(ctx->bc, &alu);
3051 if (r)
3052 return r;
3053
3054 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3055 alu.op = ALU_OP1_MBCNT_32HI_INT;
3056 alu.dst.sel = ctx->temp_reg;
3057 alu.dst.chan = 1;
3058 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3059 alu.src[0].value = 0xffffffff;
3060 alu.dst.write = 1;
3061 r = r600_bytecode_add_alu(ctx->bc, &alu);
3062 if (r)
3063 return r;
3064
3065 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3066 alu.op = ALU_OP3_MULADD_UINT24;
3067 alu.dst.sel = ctx->temp_reg;
3068 alu.dst.chan = 2;
3069 alu.src[0].sel = EG_V_SQ_ALU_SRC_SE_ID;
3070 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3071 alu.src[1].value = 256;
3072 alu.src[2].sel = EG_V_SQ_ALU_SRC_HW_WAVE_ID;
3073 alu.dst.write = 1;
3074 alu.is_op3 = 1;
3075 alu.last = 1;
3076 r = r600_bytecode_add_alu(ctx->bc, &alu);
3077 if (r)
3078 return r;
3079
3080 r = single_alu_op3(ctx, ALU_OP3_MULADD_UINT24,
3081 ctx->thread_id_gpr, 1,
3082 ctx->temp_reg, 2,
3083 V_SQ_ALU_SRC_LITERAL, 0x40,
3084 ctx->temp_reg, 0);
3085 if (r)
3086 return r;
3087 ctx->thread_id_gpr_loaded = true;
3088 return 0;
3089 }
3090
3091 static int r600_shader_from_tgsi(struct r600_context *rctx,
3092 struct r600_pipe_shader *pipeshader,
3093 union r600_shader_key key)
3094 {
3095 struct r600_screen *rscreen = rctx->screen;
3096 struct r600_shader *shader = &pipeshader->shader;
3097 struct tgsi_token *tokens = pipeshader->selector->tokens;
3098 struct pipe_stream_output_info so = pipeshader->selector->so;
3099 struct tgsi_full_immediate *immediate;
3100 struct r600_shader_ctx ctx;
3101 struct r600_bytecode_output output[ARRAY_SIZE(shader->output)];
3102 unsigned output_done, noutput;
3103 unsigned opcode;
3104 int j, k, r = 0;
3105 unsigned i;
3106 int next_param_base = 0, next_clip_base;
3107 int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
3108 bool indirect_gprs;
3109 bool ring_outputs = false;
3110 bool lds_outputs = false;
3111 bool lds_inputs = false;
3112 bool pos_emitted = false;
3113
3114 ctx.bc = &shader->bc;
3115 ctx.shader = shader;
3116
3117 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
3118 rscreen->has_compressed_msaa_texturing);
3119 ctx.tokens = tokens;
3120 tgsi_scan_shader(tokens, &ctx.info);
3121 shader->indirect_files = ctx.info.indirect_files;
3122
3123 shader->uses_doubles = ctx.info.uses_doubles;
3124 shader->uses_atomics = ctx.info.file_mask[TGSI_FILE_HW_ATOMIC];
3125 shader->nsys_inputs = 0;
3126
3127 shader->uses_images = ctx.info.file_count[TGSI_FILE_IMAGE] > 0 ||
3128 ctx.info.file_count[TGSI_FILE_BUFFER] > 0;
3129 indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
3130 tgsi_parse_init(&ctx.parse, tokens);
3131 ctx.type = ctx.info.processor;
3132 shader->processor_type = ctx.type;
3133 ctx.bc->type = shader->processor_type;
3134
3135 switch (ctx.type) {
3136 case PIPE_SHADER_VERTEX:
3137 shader->vs_as_gs_a = key.vs.as_gs_a;
3138 shader->vs_as_es = key.vs.as_es;
3139 shader->vs_as_ls = key.vs.as_ls;
3140 shader->atomic_base = key.vs.first_atomic_counter;
3141 if (shader->vs_as_es)
3142 ring_outputs = true;
3143 if (shader->vs_as_ls)
3144 lds_outputs = true;
3145 break;
3146 case PIPE_SHADER_GEOMETRY:
3147 ring_outputs = true;
3148 shader->atomic_base = key.gs.first_atomic_counter;
3149 shader->gs_tri_strip_adj_fix = key.gs.tri_strip_adj_fix;
3150 break;
3151 case PIPE_SHADER_TESS_CTRL:
3152 shader->tcs_prim_mode = key.tcs.prim_mode;
3153 shader->atomic_base = key.tcs.first_atomic_counter;
3154 lds_outputs = true;
3155 lds_inputs = true;
3156 break;
3157 case PIPE_SHADER_TESS_EVAL:
3158 shader->tes_as_es = key.tes.as_es;
3159 shader->atomic_base = key.tes.first_atomic_counter;
3160 lds_inputs = true;
3161 if (shader->tes_as_es)
3162 ring_outputs = true;
3163 break;
3164 case PIPE_SHADER_FRAGMENT:
3165 shader->two_side = key.ps.color_two_side;
3166 shader->atomic_base = key.ps.first_atomic_counter;
3167 shader->rat_base = key.ps.nr_cbufs;
3168 shader->image_size_const_offset = key.ps.image_size_const_offset;
3169 break;
3170 case PIPE_SHADER_COMPUTE:
3171 shader->rat_base = 0;
3172 shader->image_size_const_offset = 0;
3173 break;
3174 default:
3175 break;
3176 }
3177
3178 if (shader->vs_as_es || shader->tes_as_es) {
3179 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
3180 } else {
3181 ctx.gs_for_vs = NULL;
3182 }
3183
3184 ctx.next_ring_offset = 0;
3185 ctx.gs_out_ring_offset = 0;
3186 ctx.gs_next_vertex = 0;
3187 ctx.gs_stream_output_info = &so;
3188
3189 ctx.face_gpr = -1;
3190 ctx.fixed_pt_position_gpr = -1;
3191 ctx.fragcoord_input = -1;
3192 ctx.colors_used = 0;
3193 ctx.clip_vertex_write = 0;
3194 ctx.thread_id_gpr_loaded = false;
3195
3196 ctx.cs_block_size_reg = -1;
3197 ctx.cs_grid_size_reg = -1;
3198 ctx.cs_block_size_loaded = false;
3199 ctx.cs_grid_size_loaded = false;
3200
3201 shader->nr_ps_color_exports = 0;
3202 shader->nr_ps_max_color_exports = 0;
3203
3204
3205 /* register allocations */
3206 /* Values [0,127] correspond to GPR[0..127].
3207 * Values [128,159] correspond to constant buffer bank 0
3208 * Values [160,191] correspond to constant buffer bank 1
3209 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
3210 * Values [256,287] correspond to constant buffer bank 2 (EG)
3211 * Values [288,319] correspond to constant buffer bank 3 (EG)
3212 * Other special values are shown in the list below.
3213 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
3214 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
3215 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
3216 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
3217 * 248 SQ_ALU_SRC_0: special constant 0.0.
3218 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
3219 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
3220 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
3221 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
3222 * 253 SQ_ALU_SRC_LITERAL: literal constant.
3223 * 254 SQ_ALU_SRC_PV: previous vector result.
3224 * 255 SQ_ALU_SRC_PS: previous scalar result.
3225 */
3226 for (i = 0; i < TGSI_FILE_COUNT; i++) {
3227 ctx.file_offset[i] = 0;
3228 }
3229
3230 if (ctx.type == PIPE_SHADER_VERTEX) {
3231
3232 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3233 if (ctx.info.num_inputs)
3234 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
3235 }
3236 if (ctx.type == PIPE_SHADER_FRAGMENT) {
3237 if (ctx.bc->chip_class >= EVERGREEN)
3238 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
3239 else
3240 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
3241 }
3242 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3243 /* FIXME 1 would be enough in some cases (3 or less input vertices) */
3244 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3245 }
3246 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3247 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3248 if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3249 bool add_tesscoord = false, add_tess_inout = false;
3250 ctx.file_offset[TGSI_FILE_INPUT] = 1;
3251 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3252 /* if we have tesscoord save one reg */
3253 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSCOORD)
3254 add_tesscoord = true;
3255 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSINNER ||
3256 ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_TESSOUTER)
3257 add_tess_inout = true;
3258 }
3259 if (add_tesscoord || add_tess_inout)
3260 ctx.file_offset[TGSI_FILE_INPUT]++;
3261 if (add_tess_inout)
3262 ctx.file_offset[TGSI_FILE_INPUT]+=2;
3263 }
3264 if (ctx.type == PIPE_SHADER_COMPUTE) {
3265 ctx.file_offset[TGSI_FILE_INPUT] = 2;
3266 for (i = 0; i < PIPE_MAX_SHADER_INPUTS; i++) {
3267 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_GRID_SIZE)
3268 ctx.cs_grid_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3269 if (ctx.info.system_value_semantic_name[i] == TGSI_SEMANTIC_BLOCK_SIZE)
3270 ctx.cs_block_size_reg = ctx.file_offset[TGSI_FILE_INPUT]++;
3271 }
3272 }
3273
3274 ctx.file_offset[TGSI_FILE_OUTPUT] =
3275 ctx.file_offset[TGSI_FILE_INPUT] +
3276 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3277 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
3278 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
3279
3280 /* Outside the GPR range. This will be translated to one of the
3281 * kcache banks later. */
3282 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
3283
3284 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
3285 ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
3286 ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
3287 ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
3288 ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
3289
3290 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3291 ctx.tess_input_info = ctx.bc->ar_reg + 3;
3292 ctx.tess_output_info = ctx.bc->ar_reg + 4;
3293 ctx.temp_reg = ctx.bc->ar_reg + 5;
3294 } else if (ctx.type == PIPE_SHADER_TESS_EVAL) {
3295 ctx.tess_input_info = 0;
3296 ctx.tess_output_info = ctx.bc->ar_reg + 3;
3297 ctx.temp_reg = ctx.bc->ar_reg + 4;
3298 } else if (ctx.type == PIPE_SHADER_GEOMETRY) {
3299 ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
3300 ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
3301 ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
3302 ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
3303 ctx.temp_reg = ctx.bc->ar_reg + 7;
3304 if (ctx.shader->gs_tri_strip_adj_fix) {
3305 ctx.gs_rotated_input[0] = ctx.bc->ar_reg + 7;
3306 ctx.gs_rotated_input[1] = ctx.bc->ar_reg + 8;
3307 ctx.temp_reg += 2;
3308 } else {
3309 ctx.gs_rotated_input[0] = 0;
3310 ctx.gs_rotated_input[1] = 1;
3311 }
3312 } else {
3313 ctx.temp_reg = ctx.bc->ar_reg + 3;
3314 }
3315
3316 if (shader->uses_images) {
3317 ctx.thread_id_gpr = ctx.temp_reg++;
3318 ctx.thread_id_gpr_loaded = false;
3319 }
3320
3321 shader->max_arrays = 0;
3322 shader->num_arrays = 0;
3323 if (indirect_gprs) {
3324
3325 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
3326 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
3327 ctx.file_offset[TGSI_FILE_OUTPUT] -
3328 ctx.file_offset[TGSI_FILE_INPUT],
3329 0x0F);
3330 }
3331 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
3332 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
3333 ctx.file_offset[TGSI_FILE_TEMPORARY] -
3334 ctx.file_offset[TGSI_FILE_OUTPUT],
3335 0x0F);
3336 }
3337 }
3338
3339 ctx.nliterals = 0;
3340 ctx.literals = NULL;
3341 ctx.max_driver_temp_used = 0;
3342
3343 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS] &&
3344 ctx.info.colors_written == 1;
3345 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
3346 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
3347
3348 if (ctx.type == PIPE_SHADER_VERTEX ||
3349 ctx.type == PIPE_SHADER_GEOMETRY ||
3350 ctx.type == PIPE_SHADER_TESS_EVAL) {
3351 shader->cc_dist_mask = (1 << (ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED] +
3352 ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED])) - 1;
3353 shader->clip_dist_write = (1 << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED]) - 1;
3354 shader->cull_dist_write = ((1 << ctx.info.properties[TGSI_PROPERTY_NUM_CULLDIST_ENABLED]) - 1) << ctx.info.properties[TGSI_PROPERTY_NUM_CLIPDIST_ENABLED];
3355 }
3356
3357 if (shader->vs_as_gs_a)
3358 vs_add_primid_output(&ctx, key.vs.prim_id_out);
3359
3360 if (ctx.type == PIPE_SHADER_TESS_EVAL)
3361 r600_fetch_tess_io_info(&ctx);
3362
3363 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3364 tgsi_parse_token(&ctx.parse);
3365 switch (ctx.parse.FullToken.Token.Type) {
3366 case TGSI_TOKEN_TYPE_IMMEDIATE:
3367 immediate = &ctx.parse.FullToken.FullImmediate;
3368 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
3369 if(ctx.literals == NULL) {
3370 r = -ENOMEM;
3371 goto out_err;
3372 }
3373 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
3374 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
3375 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
3376 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
3377 ctx.nliterals++;
3378 break;
3379 case TGSI_TOKEN_TYPE_DECLARATION:
3380 r = tgsi_declaration(&ctx);
3381 if (r)
3382 goto out_err;
3383 break;
3384 case TGSI_TOKEN_TYPE_INSTRUCTION:
3385 case TGSI_TOKEN_TYPE_PROPERTY:
3386 break;
3387 default:
3388 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
3389 r = -EINVAL;
3390 goto out_err;
3391 }
3392 }
3393
3394 shader->ring_item_sizes[0] = ctx.next_ring_offset;
3395 shader->ring_item_sizes[1] = 0;
3396 shader->ring_item_sizes[2] = 0;
3397 shader->ring_item_sizes[3] = 0;
3398
3399 /* Process two side if needed */
3400 if (shader->two_side && ctx.colors_used) {
3401 int i, count = ctx.shader->ninput;
3402 unsigned next_lds_loc = ctx.shader->nlds;
3403
3404 /* additional inputs will be allocated right after the existing inputs,
3405 * we won't need them after the color selection, so we don't need to
3406 * reserve these gprs for the rest of the shader code and to adjust
3407 * output offsets etc. */
3408 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
3409 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
3410
3411 /* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
3412 if (ctx.face_gpr == -1) {
3413 i = ctx.shader->ninput++;
3414 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
3415 ctx.shader->input[i].spi_sid = 0;
3416 ctx.shader->input[i].gpr = gpr++;
3417 ctx.face_gpr = ctx.shader->input[i].gpr;
3418 }
3419
3420 for (i = 0; i < count; i++) {
3421 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
3422 int ni = ctx.shader->ninput++;
3423 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
3424 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
3425 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
3426 ctx.shader->input[ni].gpr = gpr++;
3427 // TGSI to LLVM needs to know the lds position of inputs.
3428 // Non LLVM path computes it later (in process_twoside_color)
3429 ctx.shader->input[ni].lds_pos = next_lds_loc++;
3430 ctx.shader->input[i].back_color_input = ni;
3431 if (ctx.bc->chip_class >= EVERGREEN) {
3432 if ((r = evergreen_interp_input(&ctx, ni)))
3433 return r;
3434 }
3435 }
3436 }
3437 }
3438
3439 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
3440 shader->nr_ps_max_color_exports = 8;
3441
3442 if (ctx.fragcoord_input >= 0) {
3443 if (ctx.bc->chip_class == CAYMAN) {
3444 for (j = 0 ; j < 4; j++) {
3445 struct r600_bytecode_alu alu;
3446 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3447 alu.op = ALU_OP1_RECIP_IEEE;
3448 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3449 alu.src[0].chan = 3;
3450
3451 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3452 alu.dst.chan = j;
3453 alu.dst.write = (j == 3);
3454 alu.last = 1;
3455 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3456 return r;
3457 }
3458 } else {
3459 struct r600_bytecode_alu alu;
3460 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3461 alu.op = ALU_OP1_RECIP_IEEE;
3462 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
3463 alu.src[0].chan = 3;
3464
3465 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
3466 alu.dst.chan = 3;
3467 alu.dst.write = 1;
3468 alu.last = 1;
3469 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
3470 return r;
3471 }
3472 }
3473
3474 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3475 struct r600_bytecode_alu alu;
3476 int r;
3477
3478 /* GS thread with no output workaround - emit a cut at start of GS */
3479 if (ctx.bc->chip_class == R600)
3480 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
3481
3482 for (j = 0; j < 4; j++) {
3483 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3484 alu.op = ALU_OP1_MOV;
3485 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
3486 alu.src[0].value = 0;
3487 alu.dst.sel = ctx.gs_export_gpr_tregs[j];
3488 alu.dst.write = 1;
3489 alu.last = 1;
3490 r = r600_bytecode_add_alu(ctx.bc, &alu);
3491 if (r)
3492 return r;
3493 }
3494
3495 if (ctx.shader->gs_tri_strip_adj_fix) {
3496 r = single_alu_op2(&ctx, ALU_OP2_AND_INT,
3497 ctx.gs_rotated_input[0], 2,
3498 0, 2,
3499 V_SQ_ALU_SRC_LITERAL, 1);
3500 if (r)
3501 return r;
3502
3503 for (i = 0; i < 6; i++) {
3504 int rotated = (i + 4) % 6;
3505 int offset_reg = i / 3;
3506 int offset_chan = i % 3;
3507 int rotated_offset_reg = rotated / 3;
3508 int rotated_offset_chan = rotated % 3;
3509
3510 if (offset_reg == 0 && offset_chan == 2)
3511 offset_chan = 3;
3512 if (rotated_offset_reg == 0 && rotated_offset_chan == 2)
3513 rotated_offset_chan = 3;
3514
3515 r = single_alu_op3(&ctx, ALU_OP3_CNDE_INT,
3516 ctx.gs_rotated_input[offset_reg], offset_chan,
3517 ctx.gs_rotated_input[0], 2,
3518 offset_reg, offset_chan,
3519 rotated_offset_reg, rotated_offset_chan);
3520 if (r)
3521 return r;
3522 }
3523 }
3524 }
3525
3526 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3527 r600_fetch_tess_io_info(&ctx);
3528
3529 if (shader->two_side && ctx.colors_used) {
3530 if ((r = process_twoside_color_inputs(&ctx)))
3531 return r;
3532 }
3533
3534 tgsi_parse_init(&ctx.parse, tokens);
3535 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
3536 tgsi_parse_token(&ctx.parse);
3537 switch (ctx.parse.FullToken.Token.Type) {
3538 case TGSI_TOKEN_TYPE_INSTRUCTION:
3539 r = tgsi_is_supported(&ctx);
3540 if (r)
3541 goto out_err;
3542 ctx.max_driver_temp_used = 0;
3543 /* reserve first tmp for everyone */
3544 r600_get_temp(&ctx);
3545
3546 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
3547 if ((r = tgsi_split_constant(&ctx)))
3548 goto out_err;
3549 if ((r = tgsi_split_literal_constant(&ctx)))
3550 goto out_err;
3551 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3552 if ((r = tgsi_split_gs_inputs(&ctx)))
3553 goto out_err;
3554 } else if (lds_inputs) {
3555 if ((r = tgsi_split_lds_inputs(&ctx)))
3556 goto out_err;
3557 }
3558 if (ctx.bc->chip_class == CAYMAN)
3559 ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
3560 else if (ctx.bc->chip_class >= EVERGREEN)
3561 ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
3562 else
3563 ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
3564 r = ctx.inst_info->process(&ctx);
3565 if (r)
3566 goto out_err;
3567
3568 if (ctx.type == PIPE_SHADER_TESS_CTRL) {
3569 r = r600_store_tcs_output(&ctx);
3570 if (r)
3571 goto out_err;
3572 }
3573 break;
3574 default:
3575 break;
3576 }
3577 }
3578
3579 /* Reset the temporary register counter. */
3580 ctx.max_driver_temp_used = 0;
3581
3582 noutput = shader->noutput;
3583
3584 if (!ring_outputs && ctx.clip_vertex_write) {
3585 unsigned clipdist_temp[2];
3586
3587 clipdist_temp[0] = r600_get_temp(&ctx);
3588 clipdist_temp[1] = r600_get_temp(&ctx);
3589
3590 /* need to convert a clipvertex write into clipdistance writes and not export
3591 the clip vertex anymore */
3592
3593 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
3594 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3595 shader->output[noutput].gpr = clipdist_temp[0];
3596 noutput++;
3597 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
3598 shader->output[noutput].gpr = clipdist_temp[1];
3599 noutput++;
3600
3601 /* reset spi_sid for clipvertex output to avoid confusing spi */
3602 shader->output[ctx.cv_output].spi_sid = 0;
3603
3604 shader->clip_dist_write = 0xFF;
3605 shader->cc_dist_mask = 0xFF;
3606
3607 for (i = 0; i < 8; i++) {
3608 int oreg = i >> 2;
3609 int ochan = i & 3;
3610
3611 for (j = 0; j < 4; j++) {
3612 struct r600_bytecode_alu alu;
3613 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3614 alu.op = ALU_OP2_DOT4;
3615 alu.src[0].sel = shader->output[ctx.cv_output].gpr;
3616 alu.src[0].chan = j;
3617
3618 alu.src[1].sel = 512 + i;
3619 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
3620 alu.src[1].chan = j;
3621
3622 alu.dst.sel = clipdist_temp[oreg];
3623 alu.dst.chan = j;
3624 alu.dst.write = (j == ochan);
3625 if (j == 3)
3626 alu.last = 1;
3627 r = r600_bytecode_add_alu(ctx.bc, &alu);
3628 if (r)
3629 return r;
3630 }
3631 }
3632 }
3633
3634 /* Add stream outputs. */
3635 if (so.num_outputs) {
3636 bool emit = false;
3637 if (!lds_outputs && !ring_outputs && ctx.type == PIPE_SHADER_VERTEX)
3638 emit = true;
3639 if (!ring_outputs && ctx.type == PIPE_SHADER_TESS_EVAL)
3640 emit = true;
3641 if (emit)
3642 emit_streamout(&ctx, &so, -1, NULL);
3643 }
3644 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
3645 convert_edgeflag_to_int(&ctx);
3646
3647 if (ctx.type == PIPE_SHADER_TESS_CTRL)
3648 r600_emit_tess_factor(&ctx);
3649
3650 if (lds_outputs) {
3651 if (ctx.type == PIPE_SHADER_VERTEX) {
3652 if (ctx.shader->noutput)
3653 emit_lds_vs_writes(&ctx);
3654 }
3655 } else if (ring_outputs) {
3656 if (shader->vs_as_es || shader->tes_as_es) {
3657 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
3658 ctx.gs_export_gpr_tregs[1] = -1;
3659 ctx.gs_export_gpr_tregs[2] = -1;
3660 ctx.gs_export_gpr_tregs[3] = -1;
3661
3662 emit_gs_ring_writes(&ctx, &so, -1, FALSE);
3663 }
3664 } else {
3665 /* Export output */
3666 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
3667
3668 for (i = 0, j = 0; i < noutput; i++, j++) {
3669 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3670 output[j].gpr = shader->output[i].gpr;
3671 output[j].elem_size = 3;
3672 output[j].swizzle_x = 0;
3673 output[j].swizzle_y = 1;
3674 output[j].swizzle_z = 2;
3675 output[j].swizzle_w = 3;
3676 output[j].burst_count = 1;
3677 output[j].type = 0xffffffff;
3678 output[j].op = CF_OP_EXPORT;
3679 switch (ctx.type) {
3680 case PIPE_SHADER_VERTEX:
3681 case PIPE_SHADER_TESS_EVAL:
3682 switch (shader->output[i].name) {
3683 case TGSI_SEMANTIC_POSITION:
3684 output[j].array_base = 60;
3685 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3686 pos_emitted = true;
3687 break;
3688
3689 case TGSI_SEMANTIC_PSIZE:
3690 output[j].array_base = 61;
3691 output[j].swizzle_y = 7;
3692 output[j].swizzle_z = 7;
3693 output[j].swizzle_w = 7;
3694 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3695 pos_emitted = true;
3696 break;
3697 case TGSI_SEMANTIC_EDGEFLAG:
3698 output[j].array_base = 61;
3699 output[j].swizzle_x = 7;
3700 output[j].swizzle_y = 0;
3701 output[j].swizzle_z = 7;
3702 output[j].swizzle_w = 7;
3703 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3704 pos_emitted = true;
3705 break;
3706 case TGSI_SEMANTIC_LAYER:
3707 /* spi_sid is 0 for outputs that are
3708 * not consumed by PS */
3709 if (shader->output[i].spi_sid) {
3710 output[j].array_base = next_param_base++;
3711 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3712 j++;
3713 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3714 }
3715 output[j].array_base = 61;
3716 output[j].swizzle_x = 7;
3717 output[j].swizzle_y = 7;
3718 output[j].swizzle_z = 0;
3719 output[j].swizzle_w = 7;
3720 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3721 pos_emitted = true;
3722 break;
3723 case TGSI_SEMANTIC_VIEWPORT_INDEX:
3724 /* spi_sid is 0 for outputs that are
3725 * not consumed by PS */
3726 if (shader->output[i].spi_sid) {
3727 output[j].array_base = next_param_base++;
3728 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3729 j++;
3730 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3731 }
3732 output[j].array_base = 61;
3733 output[j].swizzle_x = 7;
3734 output[j].swizzle_y = 7;
3735 output[j].swizzle_z = 7;
3736 output[j].swizzle_w = 0;
3737 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3738 pos_emitted = true;
3739 break;
3740 case TGSI_SEMANTIC_CLIPVERTEX:
3741 j--;
3742 break;
3743 case TGSI_SEMANTIC_CLIPDIST:
3744 output[j].array_base = next_clip_base++;
3745 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3746 pos_emitted = true;
3747 /* spi_sid is 0 for clipdistance outputs that were generated
3748 * for clipvertex - we don't need to pass them to PS */
3749 if (shader->output[i].spi_sid) {
3750 j++;
3751 /* duplicate it as PARAM to pass to the pixel shader */
3752 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
3753 output[j].array_base = next_param_base++;
3754 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3755 }
3756 break;
3757 case TGSI_SEMANTIC_FOG:
3758 output[j].swizzle_y = 4; /* 0 */
3759 output[j].swizzle_z = 4; /* 0 */
3760 output[j].swizzle_w = 5; /* 1 */
3761 break;
3762 case TGSI_SEMANTIC_PRIMID:
3763 output[j].swizzle_x = 2;
3764 output[j].swizzle_y = 4; /* 0 */
3765 output[j].swizzle_z = 4; /* 0 */
3766 output[j].swizzle_w = 4; /* 0 */
3767 break;
3768 }
3769
3770 break;
3771 case PIPE_SHADER_FRAGMENT:
3772 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
3773 /* never export more colors than the number of CBs */
3774 if (shader->output[i].sid >= max_color_exports) {
3775 /* skip export */
3776 j--;
3777 continue;
3778 }
3779 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3780 output[j].array_base = shader->output[i].sid;
3781 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3782 shader->nr_ps_color_exports++;
3783 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
3784 for (k = 1; k < max_color_exports; k++) {
3785 j++;
3786 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3787 output[j].gpr = shader->output[i].gpr;
3788 output[j].elem_size = 3;
3789 output[j].swizzle_x = 0;
3790 output[j].swizzle_y = 1;
3791 output[j].swizzle_z = 2;
3792 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
3793 output[j].burst_count = 1;
3794 output[j].array_base = k;
3795 output[j].op = CF_OP_EXPORT;
3796 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3797 shader->nr_ps_color_exports++;
3798 }
3799 }
3800 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
3801 output[j].array_base = 61;
3802 output[j].swizzle_x = 2;
3803 output[j].swizzle_y = 7;
3804 output[j].swizzle_z = output[j].swizzle_w = 7;
3805 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3806 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
3807 output[j].array_base = 61;
3808 output[j].swizzle_x = 7;
3809 output[j].swizzle_y = 1;
3810 output[j].swizzle_z = output[j].swizzle_w = 7;
3811 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3812 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
3813 output[j].array_base = 61;
3814 output[j].swizzle_x = 7;
3815 output[j].swizzle_y = 7;
3816 output[j].swizzle_z = 0;
3817 output[j].swizzle_w = 7;
3818 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3819 } else {
3820 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
3821 r = -EINVAL;
3822 goto out_err;
3823 }
3824 break;
3825 case PIPE_SHADER_TESS_CTRL:
3826 break;
3827 default:
3828 R600_ERR("unsupported processor type %d\n", ctx.type);
3829 r = -EINVAL;
3830 goto out_err;
3831 }
3832
3833 if (output[j].type == 0xffffffff) {
3834 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3835 output[j].array_base = next_param_base++;
3836 }
3837 }
3838
3839 /* add fake position export */
3840 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && pos_emitted == false) {
3841 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3842 output[j].gpr = 0;
3843 output[j].elem_size = 3;
3844 output[j].swizzle_x = 7;
3845 output[j].swizzle_y = 7;
3846 output[j].swizzle_z = 7;
3847 output[j].swizzle_w = 7;
3848 output[j].burst_count = 1;
3849 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
3850 output[j].array_base = 60;
3851 output[j].op = CF_OP_EXPORT;
3852 j++;
3853 }
3854
3855 /* add fake param output for vertex shader if no param is exported */
3856 if ((ctx.type == PIPE_SHADER_VERTEX || ctx.type == PIPE_SHADER_TESS_EVAL) && next_param_base == 0) {
3857 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3858 output[j].gpr = 0;
3859 output[j].elem_size = 3;
3860 output[j].swizzle_x = 7;
3861 output[j].swizzle_y = 7;
3862 output[j].swizzle_z = 7;
3863 output[j].swizzle_w = 7;
3864 output[j].burst_count = 1;
3865 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
3866 output[j].array_base = 0;
3867 output[j].op = CF_OP_EXPORT;
3868 j++;
3869 }
3870
3871 /* add fake pixel export */
3872 if (ctx.type == PIPE_SHADER_FRAGMENT && shader->nr_ps_color_exports == 0) {
3873 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
3874 output[j].gpr = 0;
3875 output[j].elem_size = 3;
3876 output[j].swizzle_x = 7;
3877 output[j].swizzle_y = 7;
3878 output[j].swizzle_z = 7;
3879 output[j].swizzle_w = 7;
3880 output[j].burst_count = 1;
3881 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
3882 output[j].array_base = 0;
3883 output[j].op = CF_OP_EXPORT;
3884 j++;
3885 shader->nr_ps_color_exports++;
3886 }
3887
3888 noutput = j;
3889
3890 /* set export done on last export of each type */
3891 for (k = noutput - 1, output_done = 0; k >= 0; k--) {
3892 if (!(output_done & (1 << output[k].type))) {
3893 output_done |= (1 << output[k].type);
3894 output[k].op = CF_OP_EXPORT_DONE;
3895 }
3896 }
3897 /* add output to bytecode */
3898 for (i = 0; i < noutput; i++) {
3899 r = r600_bytecode_add_output(ctx.bc, &output[i]);
3900 if (r)
3901 goto out_err;
3902 }
3903 }
3904
3905 /* add program end */
3906 if (ctx.bc->chip_class == CAYMAN)
3907 cm_bytecode_add_cf_end(ctx.bc);
3908 else {
3909 const struct cf_op_info *last = NULL;
3910
3911 if (ctx.bc->cf_last)
3912 last = r600_isa_cf(ctx.bc->cf_last->op);
3913
3914 /* alu clause instructions don't have EOP bit, so add NOP */
3915 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_POP)
3916 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
3917
3918 ctx.bc->cf_last->end_of_program = 1;
3919 }
3920
3921 /* check GPR limit - we have 124 = 128 - 4
3922 * (4 are reserved as alu clause temporary registers) */
3923 if (ctx.bc->ngpr > 124) {
3924 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
3925 r = -ENOMEM;
3926 goto out_err;
3927 }
3928
3929 if (ctx.type == PIPE_SHADER_GEOMETRY) {
3930 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
3931 return r;
3932 }
3933
3934 free(ctx.literals);
3935 tgsi_parse_free(&ctx.parse);
3936 return 0;
3937 out_err:
3938 free(ctx.literals);
3939 tgsi_parse_free(&ctx.parse);
3940 return r;
3941 }
3942
3943 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
3944 {
3945 const unsigned tgsi_opcode =
3946 ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
3947 R600_ERR("%s tgsi opcode unsupported\n",
3948 tgsi_get_opcode_name(tgsi_opcode));
3949 return -EINVAL;
3950 }
3951
3952 static int tgsi_end(struct r600_shader_ctx *ctx UNUSED)
3953 {
3954 return 0;
3955 }
3956
3957 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
3958 const struct r600_shader_src *shader_src,
3959 unsigned chan)
3960 {
3961 bc_src->sel = shader_src->sel;
3962 bc_src->chan = shader_src->swizzle[chan];
3963 bc_src->neg = shader_src->neg;
3964 bc_src->abs = shader_src->abs;
3965 bc_src->rel = shader_src->rel;
3966 bc_src->value = shader_src->value[bc_src->chan];
3967 bc_src->kc_bank = shader_src->kc_bank;
3968 bc_src->kc_rel = shader_src->kc_rel;
3969 }
3970
3971 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
3972 {
3973 bc_src->abs = 1;
3974 bc_src->neg = 0;
3975 }
3976
3977 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
3978 {
3979 bc_src->neg = !bc_src->neg;
3980 }
3981
3982 static void tgsi_dst(struct r600_shader_ctx *ctx,
3983 const struct tgsi_full_dst_register *tgsi_dst,
3984 unsigned swizzle,
3985 struct r600_bytecode_alu_dst *r600_dst)
3986 {
3987 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3988
3989 r600_dst->sel = tgsi_dst->Register.Index;
3990 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
3991 r600_dst->chan = swizzle;
3992 r600_dst->write = 1;
3993 if (inst->Instruction.Saturate) {
3994 r600_dst->clamp = 1;
3995 }
3996 if (ctx->type == PIPE_SHADER_TESS_CTRL) {
3997 if (tgsi_dst->Register.File == TGSI_FILE_OUTPUT) {
3998 return;
3999 }
4000 }
4001 if (tgsi_dst->Register.Indirect)
4002 r600_dst->rel = V_SQ_REL_RELATIVE;
4003
4004 }
4005
4006 static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap, int dest_temp, int op_override)
4007 {
4008 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4009 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4010 struct r600_bytecode_alu alu;
4011 int i, j, r, lasti = tgsi_last_instruction(write_mask);
4012 int use_tmp = 0;
4013 int swizzle_x = inst->Src[0].Register.SwizzleX;
4014
4015 if (singledest) {
4016 switch (write_mask) {
4017 case 0x1:
4018 if (swizzle_x == 2) {
4019 write_mask = 0xc;
4020 use_tmp = 3;
4021 } else
4022 write_mask = 0x3;
4023 break;
4024 case 0x2:
4025 if (swizzle_x == 2) {
4026 write_mask = 0xc;
4027 use_tmp = 3;
4028 } else {
4029 write_mask = 0x3;
4030 use_tmp = 1;
4031 }
4032 break;
4033 case 0x4:
4034 if (swizzle_x == 0) {
4035 write_mask = 0x3;
4036 use_tmp = 1;
4037 } else
4038 write_mask = 0xc;
4039 break;
4040 case 0x8:
4041 if (swizzle_x == 0) {
4042 write_mask = 0x3;
4043 use_tmp = 1;
4044 } else {
4045 write_mask = 0xc;
4046 use_tmp = 3;
4047 }
4048 break;
4049 }
4050 }
4051
4052 lasti = tgsi_last_instruction(write_mask);
4053 for (i = 0; i <= lasti; i++) {
4054
4055 if (!(write_mask & (1 << i)))
4056 continue;
4057
4058 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4059
4060 if (singledest) {
4061 if (use_tmp || dest_temp) {
4062 alu.dst.sel = use_tmp ? ctx->temp_reg : dest_temp;
4063 alu.dst.chan = i;
4064 alu.dst.write = 1;
4065 } else {
4066 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4067 }
4068 if (i == 1 || i == 3)
4069 alu.dst.write = 0;
4070 } else
4071 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4072
4073 alu.op = op_override ? op_override : ctx->inst_info->op;
4074 if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
4075 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4076 } else if (!swap) {
4077 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4078 r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
4079 }
4080 } else {
4081 r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
4082 r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
4083 }
4084
4085 /* handle some special cases */
4086 if (i == 1 || i == 3) {
4087 switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
4088 case TGSI_OPCODE_DABS:
4089 r600_bytecode_src_set_abs(&alu.src[0]);
4090 break;
4091 default:
4092 break;
4093 }
4094 }
4095 if (i == lasti) {
4096 alu.last = 1;
4097 }
4098 r = r600_bytecode_add_alu(ctx->bc, &alu);
4099 if (r)
4100 return r;
4101 }
4102
4103 if (use_tmp) {
4104 write_mask = inst->Dst[0].Register.WriteMask;
4105
4106 lasti = tgsi_last_instruction(write_mask);
4107 /* move result from temp to dst */
4108 for (i = 0; i <= lasti; i++) {
4109 if (!(write_mask & (1 << i)))
4110 continue;
4111
4112 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4113 alu.op = ALU_OP1_MOV;
4114
4115 if (dest_temp) {
4116 alu.dst.sel = dest_temp;
4117 alu.dst.chan = i;
4118 alu.dst.write = 1;
4119 } else
4120 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4121 alu.src[0].sel = ctx->temp_reg;
4122 alu.src[0].chan = use_tmp - 1;
4123 alu.last = (i == lasti);
4124
4125 r = r600_bytecode_add_alu(ctx->bc, &alu);
4126 if (r)
4127 return r;
4128 }
4129 }
4130 return 0;
4131 }
4132
4133 static int tgsi_op2_64(struct r600_shader_ctx *ctx)
4134 {
4135 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4136 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4137 /* confirm writemasking */
4138 if ((write_mask & 0x3) != 0x3 &&
4139 (write_mask & 0xc) != 0xc) {
4140 fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
4141 return -1;
4142 }
4143 return tgsi_op2_64_params(ctx, false, false, 0, 0);
4144 }
4145
/*
 * 64-bit operation that yields a single 32-bit result; sources are passed
 * in TGSI order, no destination override and no opcode override.
 */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	const bool singledest = true;
	const bool swap = false;

	return tgsi_op2_64_params(ctx, singledest, swap, 0, 0);
}
4150
/*
 * Same as tgsi_op2_64_single_dest(), but the two source operands are
 * exchanged before being handed to the hardware instruction.
 */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	const bool singledest = true;
	const bool swap = true;

	return tgsi_op2_64_params(ctx, singledest, swap, 0, 0);
}
4155
4156 static int tgsi_op3_64(struct r600_shader_ctx *ctx)
4157 {
4158 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4159 struct r600_bytecode_alu alu;
4160 int i, j, r;
4161 int lasti = 3;
4162 int tmp = r600_get_temp(ctx);
4163
4164 for (i = 0; i < lasti + 1; i++) {
4165
4166 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4167 alu.op = ctx->inst_info->op;
4168 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4169 r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
4170 }
4171
4172 if (inst->Dst[0].Register.WriteMask & (1 << i))
4173 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4174 else
4175 alu.dst.sel = tmp;
4176
4177 alu.dst.chan = i;
4178 alu.is_op3 = 1;
4179 if (i == lasti) {
4180 alu.last = 1;
4181 }
4182 r = r600_bytecode_add_alu(ctx->bc, &alu);
4183 if (r)
4184 return r;
4185 }
4186 return 0;
4187 }
4188
4189 static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
4190 {
4191 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4192 struct r600_bytecode_alu alu;
4193 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4194 int i, j, r, lasti = tgsi_last_instruction(write_mask);
4195 /* use temp register if trans_only and more than one dst component */
4196 int use_tmp = trans_only && (write_mask ^ (1 << lasti));
4197 unsigned op = ctx->inst_info->op;
4198
4199 if (op == ALU_OP2_MUL_IEEE &&
4200 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
4201 op = ALU_OP2_MUL;
4202
4203 for (i = 0; i <= lasti; i++) {
4204 if (!(write_mask & (1 << i)))
4205 continue;
4206
4207 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4208 if (use_tmp) {
4209 alu.dst.sel = ctx->temp_reg;
4210 alu.dst.chan = i;
4211 alu.dst.write = 1;
4212 } else
4213 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4214
4215 alu.op = op;
4216 if (!swap) {
4217 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4218 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
4219 }
4220 } else {
4221 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4222 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4223 }
4224 if (i == lasti || trans_only) {
4225 alu.last = 1;
4226 }
4227 r = r600_bytecode_add_alu(ctx->bc, &alu);
4228 if (r)
4229 return r;
4230 }
4231
4232 if (use_tmp) {
4233 /* move result from temp to dst */
4234 for (i = 0; i <= lasti; i++) {
4235 if (!(write_mask & (1 << i)))
4236 continue;
4237
4238 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4239 alu.op = ALU_OP1_MOV;
4240 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4241 alu.src[0].sel = ctx->temp_reg;
4242 alu.src[0].chan = i;
4243 alu.last = (i == lasti);
4244
4245 r = r600_bytecode_add_alu(ctx->bc, &alu);
4246 if (r)
4247 return r;
4248 }
4249 }
4250 return 0;
4251 }
4252
/* Plain per-channel instruction: sources in TGSI order, no forced grouping. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
4257
/* Per-channel instruction with the two source operands exchanged. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	const int swap = 1;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
4262
/*
 * Per-channel instruction emitted through the trans_only path of
 * tgsi_op2_s(): sources in TGSI order, each ALU closed with alu.last.
 */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 1;

	return tgsi_op2_s(ctx, swap, trans_only);
}
4267
4268 static int tgsi_ineg(struct r600_shader_ctx *ctx)
4269 {
4270 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4271 struct r600_bytecode_alu alu;
4272 int i, r;
4273 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4274
4275 for (i = 0; i < lasti + 1; i++) {
4276
4277 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4278 continue;
4279 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4280 alu.op = ctx->inst_info->op;
4281
4282 alu.src[0].sel = V_SQ_ALU_SRC_0;
4283
4284 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4285
4286 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4287
4288 if (i == lasti) {
4289 alu.last = 1;
4290 }
4291 r = r600_bytecode_add_alu(ctx->bc, &alu);
4292 if (r)
4293 return r;
4294 }
4295 return 0;
4296
4297 }
4298
4299 static int tgsi_dneg(struct r600_shader_ctx *ctx)
4300 {
4301 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4302 struct r600_bytecode_alu alu;
4303 int i, r;
4304 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4305
4306 for (i = 0; i < lasti + 1; i++) {
4307
4308 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4309 continue;
4310 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4311 alu.op = ALU_OP1_MOV;
4312
4313 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4314
4315 if (i == 1 || i == 3)
4316 r600_bytecode_src_toggle_neg(&alu.src[0]);
4317 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4318
4319 if (i == lasti) {
4320 alu.last = 1;
4321 }
4322 r = r600_bytecode_add_alu(ctx->bc, &alu);
4323 if (r)
4324 return r;
4325 }
4326 return 0;
4327
4328 }
4329
4330 static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
4331 {
4332 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4333 struct r600_bytecode_alu alu;
4334 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4335 int i, j, r;
4336
4337 for (i = 0; i <= 3; i++) {
4338 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4339 alu.op = ctx->inst_info->op;
4340
4341 alu.dst.sel = ctx->temp_reg;
4342 alu.dst.chan = i;
4343 alu.dst.write = 1;
4344 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4345 r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
4346 }
4347
4348 if (i == 3)
4349 alu.last = 1;
4350
4351 r = r600_bytecode_add_alu(ctx->bc, &alu);
4352 if (r)
4353 return r;
4354 }
4355
4356 /* Replicate significand result across channels. */
4357 for (i = 0; i <= 3; i++) {
4358 if (!(write_mask & (1 << i)))
4359 continue;
4360
4361 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4362 alu.op = ALU_OP1_MOV;
4363 alu.src[0].chan = (i & 1) + 2;
4364 alu.src[0].sel = ctx->temp_reg;
4365
4366 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4367 alu.dst.write = 1;
4368 alu.last = 1;
4369 r = r600_bytecode_add_alu(ctx->bc, &alu);
4370 if (r)
4371 return r;
4372 }
4373
4374 for (i = 0; i <= 3; i++) {
4375 if (inst->Dst[1].Register.WriteMask & (1 << i)) {
4376 /* MOV third channels to writemask dst1 */
4377 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4378 alu.op = ALU_OP1_MOV;
4379 alu.src[0].chan = 1;
4380 alu.src[0].sel = ctx->temp_reg;
4381
4382 tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
4383 alu.last = 1;
4384 r = r600_bytecode_add_alu(ctx->bc, &alu);
4385 if (r)
4386 return r;
4387 break;
4388 }
4389 }
4390 return 0;
4391 }
4392
4393
4394 static int egcm_int_to_double(struct r600_shader_ctx *ctx)
4395 {
4396 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4397 struct r600_bytecode_alu alu;
4398 int i, r;
4399 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4400
4401 assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
4402 inst->Instruction.Opcode == TGSI_OPCODE_U2D);
4403
4404 for (i = 0; i <= (lasti+1)/2; i++) {
4405 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4406 alu.op = ctx->inst_info->op;
4407
4408 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4409 alu.dst.sel = ctx->temp_reg;
4410 alu.dst.chan = i;
4411 alu.dst.write = 1;
4412 alu.last = 1;
4413
4414 r = r600_bytecode_add_alu(ctx->bc, &alu);
4415 if (r)
4416 return r;
4417 }
4418
4419 for (i = 0; i <= lasti; i++) {
4420 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4421 alu.op = ALU_OP1_FLT32_TO_FLT64;
4422
4423 alu.src[0].chan = i/2;
4424 if (i%2 == 0)
4425 alu.src[0].sel = ctx->temp_reg;
4426 else {
4427 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
4428 alu.src[0].value = 0x0;
4429 }
4430 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4431 alu.last = i == lasti;
4432
4433 r = r600_bytecode_add_alu(ctx->bc, &alu);
4434 if (r)
4435 return r;
4436 }
4437
4438 return 0;
4439 }
4440
4441 static int egcm_double_to_int(struct r600_shader_ctx *ctx)
4442 {
4443 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4444 struct r600_bytecode_alu alu;
4445 int i, r;
4446 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4447 int treg = r600_get_temp(ctx);
4448 assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
4449 inst->Instruction.Opcode == TGSI_OPCODE_D2U);
4450
4451 /* do a 64->32 into a temp register */
4452 r = tgsi_op2_64_params(ctx, true, false, treg, ALU_OP1_FLT64_TO_FLT32);
4453 if (r)
4454 return r;
4455
4456 for (i = 0; i <= lasti; i++) {
4457 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4458 continue;
4459 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4460 alu.op = ctx->inst_info->op;
4461
4462 alu.src[0].chan = i;
4463 alu.src[0].sel = treg;
4464 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4465 alu.last = (i == lasti);
4466
4467 r = r600_bytecode_add_alu(ctx->bc, &alu);
4468 if (r)
4469 return r;
4470 }
4471
4472 return 0;
4473 }
4474
4475 static int cayman_emit_unary_double_raw(struct r600_bytecode *bc,
4476 unsigned op,
4477 int dst_reg,
4478 struct r600_shader_src *src,
4479 bool abs)
4480 {
4481 struct r600_bytecode_alu alu;
4482 const int last_slot = 3;
4483 int r;
4484
4485 /* these have to write the result to X/Y by the looks of it */
4486 for (int i = 0 ; i < last_slot; i++) {
4487 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4488 alu.op = op;
4489
4490 r600_bytecode_src(&alu.src[0], src, 1);
4491 r600_bytecode_src(&alu.src[1], src, 0);
4492
4493 if (abs)
4494 r600_bytecode_src_set_abs(&alu.src[1]);
4495
4496 alu.dst.sel = dst_reg;
4497 alu.dst.chan = i;
4498 alu.dst.write = (i == 0 || i == 1);
4499
4500 if (bc->chip_class != CAYMAN || i == last_slot - 1)
4501 alu.last = 1;
4502 r = r600_bytecode_add_alu(bc, &alu);
4503 if (r)
4504 return r;
4505 }
4506
4507 return 0;
4508 }
4509
4510 static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
4511 {
4512 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4513 int i, r;
4514 struct r600_bytecode_alu alu;
4515 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4516 int t1 = ctx->temp_reg;
4517
4518 /* should only be one src regs */
4519 assert(inst->Instruction.NumSrcRegs == 1);
4520
4521 /* only support one double at a time */
4522 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
4523 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
4524
4525 r = cayman_emit_unary_double_raw(
4526 ctx->bc, ctx->inst_info->op, t1,
4527 &ctx->src[0],
4528 ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
4529 ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT);
4530 if (r)
4531 return r;
4532
4533 for (i = 0 ; i <= lasti; i++) {
4534 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4535 continue;
4536 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4537 alu.op = ALU_OP1_MOV;
4538 alu.src[0].sel = t1;
4539 alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
4540 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4541 alu.dst.write = 1;
4542 if (i == lasti)
4543 alu.last = 1;
4544 r = r600_bytecode_add_alu(ctx->bc, &alu);
4545 if (r)
4546 return r;
4547 }
4548 return 0;
4549 }
4550
4551 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
4552 {
4553 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4554 int i, j, r;
4555 struct r600_bytecode_alu alu;
4556 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
4557
4558 for (i = 0 ; i < last_slot; i++) {
4559 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4560 alu.op = ctx->inst_info->op;
4561 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4562 r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
4563
4564 /* RSQ should take the absolute value of src */
4565 if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
4566 r600_bytecode_src_set_abs(&alu.src[j]);
4567 }
4568 }
4569 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4570 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4571
4572 if (i == last_slot - 1)
4573 alu.last = 1;
4574 r = r600_bytecode_add_alu(ctx->bc, &alu);
4575 if (r)
4576 return r;
4577 }
4578 return 0;
4579 }
4580
4581 static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
4582 {
4583 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4584 int i, j, k, r;
4585 struct r600_bytecode_alu alu;
4586 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4587 int t1 = ctx->temp_reg;
4588
4589 for (k = 0; k <= lasti; k++) {
4590 if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
4591 continue;
4592
4593 for (i = 0 ; i < 4; i++) {
4594 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4595 alu.op = ctx->inst_info->op;
4596 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4597 r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
4598 }
4599 alu.dst.sel = t1;
4600 alu.dst.chan = i;
4601 alu.dst.write = (i == k);
4602 if (i == 3)
4603 alu.last = 1;
4604 r = r600_bytecode_add_alu(ctx->bc, &alu);
4605 if (r)
4606 return r;
4607 }
4608 }
4609
4610 for (i = 0 ; i <= lasti; i++) {
4611 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4612 continue;
4613 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4614 alu.op = ALU_OP1_MOV;
4615 alu.src[0].sel = t1;
4616 alu.src[0].chan = i;
4617 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4618 alu.dst.write = 1;
4619 if (i == lasti)
4620 alu.last = 1;
4621 r = r600_bytecode_add_alu(ctx->bc, &alu);
4622 if (r)
4623 return r;
4624 }
4625
4626 return 0;
4627 }
4628
4629
4630 static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
4631 {
4632 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4633 int i, j, k, r;
4634 struct r600_bytecode_alu alu;
4635 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4636 int t1 = ctx->temp_reg;
4637
4638 /* t1 would get overwritten below if we actually tried to
4639 * multiply two pairs of doubles at a time. */
4640 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
4641 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
4642
4643 k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
4644
4645 for (i = 0; i < 4; i++) {
4646 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4647 alu.op = ctx->inst_info->op;
4648 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4649 r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));
4650 }
4651 alu.dst.sel = t1;
4652 alu.dst.chan = i;
4653 alu.dst.write = 1;
4654 if (i == 3)
4655 alu.last = 1;
4656 r = r600_bytecode_add_alu(ctx->bc, &alu);
4657 if (r)
4658 return r;
4659 }
4660
4661 for (i = 0; i <= lasti; i++) {
4662 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4663 continue;
4664 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4665 alu.op = ALU_OP1_MOV;
4666 alu.src[0].sel = t1;
4667 alu.src[0].chan = i;
4668 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4669 alu.dst.write = 1;
4670 if (i == lasti)
4671 alu.last = 1;
4672 r = r600_bytecode_add_alu(ctx->bc, &alu);
4673 if (r)
4674 return r;
4675 }
4676
4677 return 0;
4678 }
4679
4680 /*
4681 * Emit RECIP_64 + MUL_64 to implement division.
4682 */
4683 static int cayman_ddiv_instr(struct r600_shader_ctx *ctx)
4684 {
4685 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4686 int r;
4687 struct r600_bytecode_alu alu;
4688 int t1 = ctx->temp_reg;
4689 int k;
4690
4691 /* Only support one double at a time. This is the same constraint as
4692 * in DMUL lowering. */
4693 assert(inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ||
4694 inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_ZW);
4695
4696 k = inst->Dst[0].Register.WriteMask == TGSI_WRITEMASK_XY ? 0 : 1;
4697
4698 r = cayman_emit_unary_double_raw(ctx->bc, ALU_OP2_RECIP_64, t1, &ctx->src[1], false);
4699 if (r)
4700 return r;
4701
4702 for (int i = 0; i < 4; i++) {
4703 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4704 alu.op = ALU_OP2_MUL_64;
4705
4706 r600_bytecode_src(&alu.src[0], &ctx->src[0], k * 2 + ((i == 3) ? 0 : 1));
4707
4708 alu.src[1].sel = t1;
4709 alu.src[1].chan = (i == 3) ? 0 : 1;
4710
4711 alu.dst.sel = t1;
4712 alu.dst.chan = i;
4713 alu.dst.write = 1;
4714 if (i == 3)
4715 alu.last = 1;
4716 r = r600_bytecode_add_alu(ctx->bc, &alu);
4717 if (r)
4718 return r;
4719 }
4720
4721 for (int i = 0; i < 2; i++) {
4722 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4723 alu.op = ALU_OP1_MOV;
4724 alu.src[0].sel = t1;
4725 alu.src[0].chan = i;
4726 tgsi_dst(ctx, &inst->Dst[0], k * 2 + i, &alu.dst);
4727 alu.dst.write = 1;
4728 if (i == 1)
4729 alu.last = 1;
4730 r = r600_bytecode_add_alu(ctx->bc, &alu);
4731 if (r)
4732 return r;
4733 }
4734 return 0;
4735 }
4736
4737 /*
4738 * r600 - trunc to -PI..PI range
4739 * r700 - normalize by dividing by 2PI
4740 * see fdo bug 27901
4741 */
4742 static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
4743 {
4744 int r;
4745 struct r600_bytecode_alu alu;
4746
4747 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4748 alu.op = ALU_OP3_MULADD;
4749 alu.is_op3 = 1;
4750
4751 alu.dst.chan = 0;
4752 alu.dst.sel = ctx->temp_reg;
4753 alu.dst.write = 1;
4754
4755 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
4756
4757 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4758 alu.src[1].chan = 0;
4759 alu.src[1].value = u_bitcast_f2u(0.5f * M_1_PI);
4760 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
4761 alu.src[2].chan = 0;
4762 alu.last = 1;
4763 r = r600_bytecode_add_alu(ctx->bc, &alu);
4764 if (r)
4765 return r;
4766
4767 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4768 alu.op = ALU_OP1_FRACT;
4769
4770 alu.dst.chan = 0;
4771 alu.dst.sel = ctx->temp_reg;
4772 alu.dst.write = 1;
4773
4774 alu.src[0].sel = ctx->temp_reg;
4775 alu.src[0].chan = 0;
4776 alu.last = 1;
4777 r = r600_bytecode_add_alu(ctx->bc, &alu);
4778 if (r)
4779 return r;
4780
4781 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4782 alu.op = ALU_OP3_MULADD;
4783 alu.is_op3 = 1;
4784
4785 alu.dst.chan = 0;
4786 alu.dst.sel = ctx->temp_reg;
4787 alu.dst.write = 1;
4788
4789 alu.src[0].sel = ctx->temp_reg;
4790 alu.src[0].chan = 0;
4791
4792 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4793 alu.src[1].chan = 0;
4794 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4795 alu.src[2].chan = 0;
4796
4797 if (ctx->bc->chip_class == R600) {
4798 alu.src[1].value = u_bitcast_f2u(2.0f * M_PI);
4799 alu.src[2].value = u_bitcast_f2u(-M_PI);
4800 } else {
4801 alu.src[1].sel = V_SQ_ALU_SRC_1;
4802 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
4803 alu.src[2].neg = 1;
4804 }
4805
4806 alu.last = 1;
4807 r = r600_bytecode_add_alu(ctx->bc, &alu);
4808 if (r)
4809 return r;
4810 return 0;
4811 }
4812
4813 static int cayman_trig(struct r600_shader_ctx *ctx)
4814 {
4815 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4816 struct r600_bytecode_alu alu;
4817 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
4818 int i, r;
4819
4820 r = tgsi_setup_trig(ctx);
4821 if (r)
4822 return r;
4823
4824
4825 for (i = 0; i < last_slot; i++) {
4826 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4827 alu.op = ctx->inst_info->op;
4828 alu.dst.chan = i;
4829
4830 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4831 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4832
4833 alu.src[0].sel = ctx->temp_reg;
4834 alu.src[0].chan = 0;
4835 if (i == last_slot - 1)
4836 alu.last = 1;
4837 r = r600_bytecode_add_alu(ctx->bc, &alu);
4838 if (r)
4839 return r;
4840 }
4841 return 0;
4842 }
4843
4844 static int tgsi_trig(struct r600_shader_ctx *ctx)
4845 {
4846 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4847 struct r600_bytecode_alu alu;
4848 int i, r;
4849 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4850
4851 r = tgsi_setup_trig(ctx);
4852 if (r)
4853 return r;
4854
4855 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4856 alu.op = ctx->inst_info->op;
4857 alu.dst.chan = 0;
4858 alu.dst.sel = ctx->temp_reg;
4859 alu.dst.write = 1;
4860
4861 alu.src[0].sel = ctx->temp_reg;
4862 alu.src[0].chan = 0;
4863 alu.last = 1;
4864 r = r600_bytecode_add_alu(ctx->bc, &alu);
4865 if (r)
4866 return r;
4867
4868 /* replicate result */
4869 for (i = 0; i < lasti + 1; i++) {
4870 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4871 continue;
4872
4873 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4874 alu.op = ALU_OP1_MOV;
4875
4876 alu.src[0].sel = ctx->temp_reg;
4877 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4878 if (i == lasti)
4879 alu.last = 1;
4880 r = r600_bytecode_add_alu(ctx->bc, &alu);
4881 if (r)
4882 return r;
4883 }
4884 return 0;
4885 }
4886
4887 static int tgsi_kill(struct r600_shader_ctx *ctx)
4888 {
4889 const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4890 struct r600_bytecode_alu alu;
4891 int i, r;
4892
4893 for (i = 0; i < 4; i++) {
4894 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4895 alu.op = ctx->inst_info->op;
4896
4897 alu.dst.chan = i;
4898
4899 alu.src[0].sel = V_SQ_ALU_SRC_0;
4900
4901 if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
4902 alu.src[1].sel = V_SQ_ALU_SRC_1;
4903 alu.src[1].neg = 1;
4904 } else {
4905 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4906 }
4907 if (i == 3) {
4908 alu.last = 1;
4909 }
4910 r = r600_bytecode_add_alu(ctx->bc, &alu);
4911 if (r)
4912 return r;
4913 }
4914
4915 /* kill must be last in ALU */
4916 ctx->bc->force_add_cf = 1;
4917 ctx->shader->uses_kill = TRUE;
4918 return 0;
4919 }
4920
4921 static int tgsi_lit(struct r600_shader_ctx *ctx)
4922 {
4923 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4924 struct r600_bytecode_alu alu;
4925 int r;
4926
4927 /* tmp.x = max(src.y, 0.0) */
4928 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4929 alu.op = ALU_OP2_MAX;
4930 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
4931 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
4932 alu.src[1].chan = 1;
4933
4934 alu.dst.sel = ctx->temp_reg;
4935 alu.dst.chan = 0;
4936 alu.dst.write = 1;
4937
4938 alu.last = 1;
4939 r = r600_bytecode_add_alu(ctx->bc, &alu);
4940 if (r)
4941 return r;
4942
4943 if (inst->Dst[0].Register.WriteMask & (1 << 2))
4944 {
4945 int chan;
4946 int sel;
4947 unsigned i;
4948
4949 if (ctx->bc->chip_class == CAYMAN) {
4950 for (i = 0; i < 3; i++) {
4951 /* tmp.z = log(tmp.x) */
4952 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4953 alu.op = ALU_OP1_LOG_CLAMPED;
4954 alu.src[0].sel = ctx->temp_reg;
4955 alu.src[0].chan = 0;
4956 alu.dst.sel = ctx->temp_reg;
4957 alu.dst.chan = i;
4958 if (i == 2) {
4959 alu.dst.write = 1;
4960 alu.last = 1;
4961 } else
4962 alu.dst.write = 0;
4963
4964 r = r600_bytecode_add_alu(ctx->bc, &alu);
4965 if (r)
4966 return r;
4967 }
4968 } else {
4969 /* tmp.z = log(tmp.x) */
4970 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4971 alu.op = ALU_OP1_LOG_CLAMPED;
4972 alu.src[0].sel = ctx->temp_reg;
4973 alu.src[0].chan = 0;
4974 alu.dst.sel = ctx->temp_reg;
4975 alu.dst.chan = 2;
4976 alu.dst.write = 1;
4977 alu.last = 1;
4978 r = r600_bytecode_add_alu(ctx->bc, &alu);
4979 if (r)
4980 return r;
4981 }
4982
4983 chan = alu.dst.chan;
4984 sel = alu.dst.sel;
4985
4986 /* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
4987 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4988 alu.op = ALU_OP3_MUL_LIT;
4989 alu.src[0].sel = sel;
4990 alu.src[0].chan = chan;
4991 r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
4992 r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
4993 alu.dst.sel = ctx->temp_reg;
4994 alu.dst.chan = 0;
4995 alu.dst.write = 1;
4996 alu.is_op3 = 1;
4997 alu.last = 1;
4998 r = r600_bytecode_add_alu(ctx->bc, &alu);
4999 if (r)
5000 return r;
5001
5002 if (ctx->bc->chip_class == CAYMAN) {
5003 for (i = 0; i < 3; i++) {
5004 /* dst.z = exp(tmp.x) */
5005 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5006 alu.op = ALU_OP1_EXP_IEEE;
5007 alu.src[0].sel = ctx->temp_reg;
5008 alu.src[0].chan = 0;
5009 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5010 if (i == 2) {
5011 alu.dst.write = 1;
5012 alu.last = 1;
5013 } else
5014 alu.dst.write = 0;
5015 r = r600_bytecode_add_alu(ctx->bc, &alu);
5016 if (r)
5017 return r;
5018 }
5019 } else {
5020 /* dst.z = exp(tmp.x) */
5021 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5022 alu.op = ALU_OP1_EXP_IEEE;
5023 alu.src[0].sel = ctx->temp_reg;
5024 alu.src[0].chan = 0;
5025 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
5026 alu.last = 1;
5027 r = r600_bytecode_add_alu(ctx->bc, &alu);
5028 if (r)
5029 return r;
5030 }
5031 }
5032
5033 /* dst.x, <- 1.0 */
5034 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5035 alu.op = ALU_OP1_MOV;
5036 alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
5037 alu.src[0].chan = 0;
5038 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
5039 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
5040 r = r600_bytecode_add_alu(ctx->bc, &alu);
5041 if (r)
5042 return r;
5043
5044 /* dst.y = max(src.x, 0.0) */
5045 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5046 alu.op = ALU_OP2_MAX;
5047 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5048 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
5049 alu.src[1].chan = 0;
5050 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
5051 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
5052 r = r600_bytecode_add_alu(ctx->bc, &alu);
5053 if (r)
5054 return r;
5055
5056 /* dst.w, <- 1.0 */
5057 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5058 alu.op = ALU_OP1_MOV;
5059 alu.src[0].sel = V_SQ_ALU_SRC_1;
5060 alu.src[0].chan = 0;
5061 tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
5062 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
5063 alu.last = 1;
5064 r = r600_bytecode_add_alu(ctx->bc, &alu);
5065 if (r)
5066 return r;
5067
5068 return 0;
5069 }
5070
5071 static int tgsi_rsq(struct r600_shader_ctx *ctx)
5072 {
5073 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5074 struct r600_bytecode_alu alu;
5075 int i, r;
5076
5077 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5078
5079 alu.op = ALU_OP1_RECIPSQRT_IEEE;
5080
5081 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5082 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5083 r600_bytecode_src_set_abs(&alu.src[i]);
5084 }
5085 alu.dst.sel = ctx->temp_reg;
5086 alu.dst.write = 1;
5087 alu.last = 1;
5088 r = r600_bytecode_add_alu(ctx->bc, &alu);
5089 if (r)
5090 return r;
5091 /* replicate result */
5092 return tgsi_helper_tempx_replicate(ctx);
5093 }
5094
5095 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
5096 {
5097 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5098 struct r600_bytecode_alu alu;
5099 int i, r;
5100
5101 for (i = 0; i < 4; i++) {
5102 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5103 alu.src[0].sel = ctx->temp_reg;
5104 alu.op = ALU_OP1_MOV;
5105 alu.dst.chan = i;
5106 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5107 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5108 if (i == 3)
5109 alu.last = 1;
5110 r = r600_bytecode_add_alu(ctx->bc, &alu);
5111 if (r)
5112 return r;
5113 }
5114 return 0;
5115 }
5116
5117 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
5118 {
5119 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5120 struct r600_bytecode_alu alu;
5121 int i, r;
5122
5123 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5124 alu.op = ctx->inst_info->op;
5125 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
5126 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
5127 }
5128 alu.dst.sel = ctx->temp_reg;
5129 alu.dst.write = 1;
5130 alu.last = 1;
5131 r = r600_bytecode_add_alu(ctx->bc, &alu);
5132 if (r)
5133 return r;
5134 /* replicate result */
5135 return tgsi_helper_tempx_replicate(ctx);
5136 }
5137
5138 static int cayman_pow(struct r600_shader_ctx *ctx)
5139 {
5140 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5141 int i, r;
5142 struct r600_bytecode_alu alu;
5143 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
5144
5145 for (i = 0; i < 3; i++) {
5146 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5147 alu.op = ALU_OP1_LOG_IEEE;
5148 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5149 alu.dst.sel = ctx->temp_reg;
5150 alu.dst.chan = i;
5151 alu.dst.write = 1;
5152 if (i == 2)
5153 alu.last = 1;
5154 r = r600_bytecode_add_alu(ctx->bc, &alu);
5155 if (r)
5156 return r;
5157 }
5158
5159 /* b * LOG2(a) */
5160 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5161 alu.op = ALU_OP2_MUL;
5162 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5163 alu.src[1].sel = ctx->temp_reg;
5164 alu.dst.sel = ctx->temp_reg;
5165 alu.dst.write = 1;
5166 alu.last = 1;
5167 r = r600_bytecode_add_alu(ctx->bc, &alu);
5168 if (r)
5169 return r;
5170
5171 for (i = 0; i < last_slot; i++) {
5172 /* POW(a,b) = EXP2(b * LOG2(a))*/
5173 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5174 alu.op = ALU_OP1_EXP_IEEE;
5175 alu.src[0].sel = ctx->temp_reg;
5176
5177 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5178 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5179 if (i == last_slot - 1)
5180 alu.last = 1;
5181 r = r600_bytecode_add_alu(ctx->bc, &alu);
5182 if (r)
5183 return r;
5184 }
5185 return 0;
5186 }
5187
5188 static int tgsi_pow(struct r600_shader_ctx *ctx)
5189 {
5190 struct r600_bytecode_alu alu;
5191 int r;
5192
5193 /* LOG2(a) */
5194 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5195 alu.op = ALU_OP1_LOG_IEEE;
5196 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5197 alu.dst.sel = ctx->temp_reg;
5198 alu.dst.write = 1;
5199 alu.last = 1;
5200 r = r600_bytecode_add_alu(ctx->bc, &alu);
5201 if (r)
5202 return r;
5203 /* b * LOG2(a) */
5204 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5205 alu.op = ALU_OP2_MUL;
5206 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5207 alu.src[1].sel = ctx->temp_reg;
5208 alu.dst.sel = ctx->temp_reg;
5209 alu.dst.write = 1;
5210 alu.last = 1;
5211 r = r600_bytecode_add_alu(ctx->bc, &alu);
5212 if (r)
5213 return r;
5214 /* POW(a,b) = EXP2(b * LOG2(a))*/
5215 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5216 alu.op = ALU_OP1_EXP_IEEE;
5217 alu.src[0].sel = ctx->temp_reg;
5218 alu.dst.sel = ctx->temp_reg;
5219 alu.dst.write = 1;
5220 alu.last = 1;
5221 r = r600_bytecode_add_alu(ctx->bc, &alu);
5222 if (r)
5223 return r;
5224 return tgsi_helper_tempx_replicate(ctx);
5225 }
5226
5227 static int emit_mul_int_op(struct r600_bytecode *bc,
5228 struct r600_bytecode_alu *alu_src)
5229 {
5230 struct r600_bytecode_alu alu;
5231 int i, r;
5232 alu = *alu_src;
5233 if (bc->chip_class == CAYMAN) {
5234 for (i = 0; i < 4; i++) {
5235 alu.dst.chan = i;
5236 alu.dst.write = (i == alu_src->dst.chan);
5237 alu.last = (i == 3);
5238
5239 r = r600_bytecode_add_alu(bc, &alu);
5240 if (r)
5241 return r;
5242 }
5243 } else {
5244 alu.last = 1;
5245 r = r600_bytecode_add_alu(bc, &alu);
5246 if (r)
5247 return r;
5248 }
5249 return 0;
5250 }
5251
5252 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
5253 {
5254 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5255 struct r600_bytecode_alu alu;
5256 int i, r, j;
5257 unsigned write_mask = inst->Dst[0].Register.WriteMask;
5258 int tmp0 = ctx->temp_reg;
5259 int tmp1 = r600_get_temp(ctx);
5260 int tmp2 = r600_get_temp(ctx);
5261 int tmp3 = r600_get_temp(ctx);
5262 /* Unsigned path:
5263 *
5264 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
5265 *
5266 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
5267 * 2. tmp0.z = lo (tmp0.x * src2)
5268 * 3. tmp0.w = -tmp0.z
5269 * 4. tmp0.y = hi (tmp0.x * src2)
5270 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
5271 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
5272 * 7. tmp1.x = tmp0.x - tmp0.w
5273 * 8. tmp1.y = tmp0.x + tmp0.w
5274 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
5275 * 10. tmp0.z = hi(tmp0.x * src1) = q
5276 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
5277 *
5278 * 12. tmp0.w = src1 - tmp0.y = r
5279 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
5280 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
5281 *
5282 * if DIV
5283 *
5284 * 15. tmp1.z = tmp0.z + 1 = q + 1
5285 * 16. tmp1.w = tmp0.z - 1 = q - 1
5286 *
5287 * else MOD
5288 *
5289 * 15. tmp1.z = tmp0.w - src2 = r - src2
5290 * 16. tmp1.w = tmp0.w + src2 = r + src2
5291 *
5292 * endif
5293 *
5294 * 17. tmp1.x = tmp1.x & tmp1.y
5295 *
5296 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
5297 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
5298 *
5299 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
5300 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
5301 *
5302 * Signed path:
5303 *
5304 * Same as unsigned, using abs values of the operands,
5305 * and fixing the sign of the result in the end.
5306 */
5307
5308 for (i = 0; i < 4; i++) {
5309 if (!(write_mask & (1<<i)))
5310 continue;
5311
5312 if (signed_op) {
5313
5314 /* tmp2.x = -src0 */
5315 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5316 alu.op = ALU_OP2_SUB_INT;
5317
5318 alu.dst.sel = tmp2;
5319 alu.dst.chan = 0;
5320 alu.dst.write = 1;
5321
5322 alu.src[0].sel = V_SQ_ALU_SRC_0;
5323
5324 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5325
5326 alu.last = 1;
5327 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5328 return r;
5329
5330 /* tmp2.y = -src1 */
5331 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5332 alu.op = ALU_OP2_SUB_INT;
5333
5334 alu.dst.sel = tmp2;
5335 alu.dst.chan = 1;
5336 alu.dst.write = 1;
5337
5338 alu.src[0].sel = V_SQ_ALU_SRC_0;
5339
5340 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5341
5342 alu.last = 1;
5343 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5344 return r;
5345
5346 /* tmp2.z sign bit is set if src0 and src2 signs are different */
5347 /* it will be a sign of the quotient */
5348 if (!mod) {
5349
5350 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5351 alu.op = ALU_OP2_XOR_INT;
5352
5353 alu.dst.sel = tmp2;
5354 alu.dst.chan = 2;
5355 alu.dst.write = 1;
5356
5357 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5358 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5359
5360 alu.last = 1;
5361 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5362 return r;
5363 }
5364
5365 /* tmp2.x = |src0| */
5366 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5367 alu.op = ALU_OP3_CNDGE_INT;
5368 alu.is_op3 = 1;
5369
5370 alu.dst.sel = tmp2;
5371 alu.dst.chan = 0;
5372 alu.dst.write = 1;
5373
5374 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5375 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5376 alu.src[2].sel = tmp2;
5377 alu.src[2].chan = 0;
5378
5379 alu.last = 1;
5380 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5381 return r;
5382
5383 /* tmp2.y = |src1| */
5384 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5385 alu.op = ALU_OP3_CNDGE_INT;
5386 alu.is_op3 = 1;
5387
5388 alu.dst.sel = tmp2;
5389 alu.dst.chan = 1;
5390 alu.dst.write = 1;
5391
5392 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5393 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5394 alu.src[2].sel = tmp2;
5395 alu.src[2].chan = 1;
5396
5397 alu.last = 1;
5398 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5399 return r;
5400
5401 }
5402
5403 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
5404 if (ctx->bc->chip_class == CAYMAN) {
5405 /* tmp3.x = u2f(src2) */
5406 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5407 alu.op = ALU_OP1_UINT_TO_FLT;
5408
5409 alu.dst.sel = tmp3;
5410 alu.dst.chan = 0;
5411 alu.dst.write = 1;
5412
5413 if (signed_op) {
5414 alu.src[0].sel = tmp2;
5415 alu.src[0].chan = 1;
5416 } else {
5417 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5418 }
5419
5420 alu.last = 1;
5421 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5422 return r;
5423
5424 /* tmp0.x = recip(tmp3.x) */
5425 for (j = 0 ; j < 3; j++) {
5426 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5427 alu.op = ALU_OP1_RECIP_IEEE;
5428
5429 alu.dst.sel = tmp0;
5430 alu.dst.chan = j;
5431 alu.dst.write = (j == 0);
5432
5433 alu.src[0].sel = tmp3;
5434 alu.src[0].chan = 0;
5435
5436 if (j == 2)
5437 alu.last = 1;
5438 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5439 return r;
5440 }
5441
5442 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5443 alu.op = ALU_OP2_MUL;
5444
5445 alu.src[0].sel = tmp0;
5446 alu.src[0].chan = 0;
5447
5448 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5449 alu.src[1].value = 0x4f800000;
5450
5451 alu.dst.sel = tmp3;
5452 alu.dst.write = 1;
5453 alu.last = 1;
5454 r = r600_bytecode_add_alu(ctx->bc, &alu);
5455 if (r)
5456 return r;
5457
5458 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5459 alu.op = ALU_OP1_FLT_TO_UINT;
5460
5461 alu.dst.sel = tmp0;
5462 alu.dst.chan = 0;
5463 alu.dst.write = 1;
5464
5465 alu.src[0].sel = tmp3;
5466 alu.src[0].chan = 0;
5467
5468 alu.last = 1;
5469 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5470 return r;
5471
5472 } else {
5473 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5474 alu.op = ALU_OP1_RECIP_UINT;
5475
5476 alu.dst.sel = tmp0;
5477 alu.dst.chan = 0;
5478 alu.dst.write = 1;
5479
5480 if (signed_op) {
5481 alu.src[0].sel = tmp2;
5482 alu.src[0].chan = 1;
5483 } else {
5484 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5485 }
5486
5487 alu.last = 1;
5488 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5489 return r;
5490 }
5491
5492 /* 2. tmp0.z = lo (tmp0.x * src2) */
5493 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5494 alu.op = ALU_OP2_MULLO_UINT;
5495
5496 alu.dst.sel = tmp0;
5497 alu.dst.chan = 2;
5498 alu.dst.write = 1;
5499
5500 alu.src[0].sel = tmp0;
5501 alu.src[0].chan = 0;
5502 if (signed_op) {
5503 alu.src[1].sel = tmp2;
5504 alu.src[1].chan = 1;
5505 } else {
5506 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5507 }
5508
5509 if ((r = emit_mul_int_op(ctx->bc, &alu)))
5510 return r;
5511
5512 /* 3. tmp0.w = -tmp0.z */
5513 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5514 alu.op = ALU_OP2_SUB_INT;
5515
5516 alu.dst.sel = tmp0;
5517 alu.dst.chan = 3;
5518 alu.dst.write = 1;
5519
5520 alu.src[0].sel = V_SQ_ALU_SRC_0;
5521 alu.src[1].sel = tmp0;
5522 alu.src[1].chan = 2;
5523
5524 alu.last = 1;
5525 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5526 return r;
5527
5528 /* 4. tmp0.y = hi (tmp0.x * src2) */
5529 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5530 alu.op = ALU_OP2_MULHI_UINT;
5531
5532 alu.dst.sel = tmp0;
5533 alu.dst.chan = 1;
5534 alu.dst.write = 1;
5535
5536 alu.src[0].sel = tmp0;
5537 alu.src[0].chan = 0;
5538
5539 if (signed_op) {
5540 alu.src[1].sel = tmp2;
5541 alu.src[1].chan = 1;
5542 } else {
5543 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5544 }
5545
5546 if ((r = emit_mul_int_op(ctx->bc, &alu)))
5547 return r;
5548
5549 /* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src)) */
5550 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5551 alu.op = ALU_OP3_CNDE_INT;
5552 alu.is_op3 = 1;
5553
5554 alu.dst.sel = tmp0;
5555 alu.dst.chan = 2;
5556 alu.dst.write = 1;
5557
5558 alu.src[0].sel = tmp0;
5559 alu.src[0].chan = 1;
5560 alu.src[1].sel = tmp0;
5561 alu.src[1].chan = 3;
5562 alu.src[2].sel = tmp0;
5563 alu.src[2].chan = 2;
5564
5565 alu.last = 1;
5566 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5567 return r;
5568
5569 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
5570 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5571 alu.op = ALU_OP2_MULHI_UINT;
5572
5573 alu.dst.sel = tmp0;
5574 alu.dst.chan = 3;
5575 alu.dst.write = 1;
5576
5577 alu.src[0].sel = tmp0;
5578 alu.src[0].chan = 2;
5579
5580 alu.src[1].sel = tmp0;
5581 alu.src[1].chan = 0;
5582
5583 if ((r = emit_mul_int_op(ctx->bc, &alu)))
5584 return r;
5585
5586 /* 7. tmp1.x = tmp0.x - tmp0.w */
5587 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5588 alu.op = ALU_OP2_SUB_INT;
5589
5590 alu.dst.sel = tmp1;
5591 alu.dst.chan = 0;
5592 alu.dst.write = 1;
5593
5594 alu.src[0].sel = tmp0;
5595 alu.src[0].chan = 0;
5596 alu.src[1].sel = tmp0;
5597 alu.src[1].chan = 3;
5598
5599 alu.last = 1;
5600 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5601 return r;
5602
5603 /* 8. tmp1.y = tmp0.x + tmp0.w */
5604 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5605 alu.op = ALU_OP2_ADD_INT;
5606
5607 alu.dst.sel = tmp1;
5608 alu.dst.chan = 1;
5609 alu.dst.write = 1;
5610
5611 alu.src[0].sel = tmp0;
5612 alu.src[0].chan = 0;
5613 alu.src[1].sel = tmp0;
5614 alu.src[1].chan = 3;
5615
5616 alu.last = 1;
5617 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5618 return r;
5619
5620 /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
5621 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5622 alu.op = ALU_OP3_CNDE_INT;
5623 alu.is_op3 = 1;
5624
5625 alu.dst.sel = tmp0;
5626 alu.dst.chan = 0;
5627 alu.dst.write = 1;
5628
5629 alu.src[0].sel = tmp0;
5630 alu.src[0].chan = 1;
5631 alu.src[1].sel = tmp1;
5632 alu.src[1].chan = 1;
5633 alu.src[2].sel = tmp1;
5634 alu.src[2].chan = 0;
5635
5636 alu.last = 1;
5637 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5638 return r;
5639
5640 /* 10. tmp0.z = hi(tmp0.x * src1) = q */
5641 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5642 alu.op = ALU_OP2_MULHI_UINT;
5643
5644 alu.dst.sel = tmp0;
5645 alu.dst.chan = 2;
5646 alu.dst.write = 1;
5647
5648 alu.src[0].sel = tmp0;
5649 alu.src[0].chan = 0;
5650
5651 if (signed_op) {
5652 alu.src[1].sel = tmp2;
5653 alu.src[1].chan = 0;
5654 } else {
5655 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5656 }
5657
5658 if ((r = emit_mul_int_op(ctx->bc, &alu)))
5659 return r;
5660
5661 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
5662 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5663 alu.op = ALU_OP2_MULLO_UINT;
5664
5665 alu.dst.sel = tmp0;
5666 alu.dst.chan = 1;
5667 alu.dst.write = 1;
5668
5669 if (signed_op) {
5670 alu.src[0].sel = tmp2;
5671 alu.src[0].chan = 1;
5672 } else {
5673 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5674 }
5675
5676 alu.src[1].sel = tmp0;
5677 alu.src[1].chan = 2;
5678
5679 if ((r = emit_mul_int_op(ctx->bc, &alu)))
5680 return r;
5681
5682 /* 12. tmp0.w = src1 - tmp0.y = r */
5683 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5684 alu.op = ALU_OP2_SUB_INT;
5685
5686 alu.dst.sel = tmp0;
5687 alu.dst.chan = 3;
5688 alu.dst.write = 1;
5689
5690 if (signed_op) {
5691 alu.src[0].sel = tmp2;
5692 alu.src[0].chan = 0;
5693 } else {
5694 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5695 }
5696
5697 alu.src[1].sel = tmp0;
5698 alu.src[1].chan = 1;
5699
5700 alu.last = 1;
5701 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5702 return r;
5703
5704 /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
5705 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5706 alu.op = ALU_OP2_SETGE_UINT;
5707
5708 alu.dst.sel = tmp1;
5709 alu.dst.chan = 0;
5710 alu.dst.write = 1;
5711
5712 alu.src[0].sel = tmp0;
5713 alu.src[0].chan = 3;
5714 if (signed_op) {
5715 alu.src[1].sel = tmp2;
5716 alu.src[1].chan = 1;
5717 } else {
5718 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5719 }
5720
5721 alu.last = 1;
5722 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5723 return r;
5724
5725 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
5726 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5727 alu.op = ALU_OP2_SETGE_UINT;
5728
5729 alu.dst.sel = tmp1;
5730 alu.dst.chan = 1;
5731 alu.dst.write = 1;
5732
5733 if (signed_op) {
5734 alu.src[0].sel = tmp2;
5735 alu.src[0].chan = 0;
5736 } else {
5737 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5738 }
5739
5740 alu.src[1].sel = tmp0;
5741 alu.src[1].chan = 1;
5742
5743 alu.last = 1;
5744 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5745 return r;
5746
5747 if (mod) { /* UMOD */
5748
5749 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */
5750 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5751 alu.op = ALU_OP2_SUB_INT;
5752
5753 alu.dst.sel = tmp1;
5754 alu.dst.chan = 2;
5755 alu.dst.write = 1;
5756
5757 alu.src[0].sel = tmp0;
5758 alu.src[0].chan = 3;
5759
5760 if (signed_op) {
5761 alu.src[1].sel = tmp2;
5762 alu.src[1].chan = 1;
5763 } else {
5764 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5765 }
5766
5767 alu.last = 1;
5768 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5769 return r;
5770
5771 /* 16. tmp1.w = tmp0.w + src2 = r + src2 */
5772 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5773 alu.op = ALU_OP2_ADD_INT;
5774
5775 alu.dst.sel = tmp1;
5776 alu.dst.chan = 3;
5777 alu.dst.write = 1;
5778
5779 alu.src[0].sel = tmp0;
5780 alu.src[0].chan = 3;
5781 if (signed_op) {
5782 alu.src[1].sel = tmp2;
5783 alu.src[1].chan = 1;
5784 } else {
5785 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5786 }
5787
5788 alu.last = 1;
5789 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5790 return r;
5791
5792 } else { /* UDIV */
5793
5794 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
5795 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5796 alu.op = ALU_OP2_ADD_INT;
5797
5798 alu.dst.sel = tmp1;
5799 alu.dst.chan = 2;
5800 alu.dst.write = 1;
5801
5802 alu.src[0].sel = tmp0;
5803 alu.src[0].chan = 2;
5804 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
5805
5806 alu.last = 1;
5807 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5808 return r;
5809
5810 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */
5811 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5812 alu.op = ALU_OP2_ADD_INT;
5813
5814 alu.dst.sel = tmp1;
5815 alu.dst.chan = 3;
5816 alu.dst.write = 1;
5817
5818 alu.src[0].sel = tmp0;
5819 alu.src[0].chan = 2;
5820 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
5821
5822 alu.last = 1;
5823 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5824 return r;
5825
5826 }
5827
5828 /* 17. tmp1.x = tmp1.x & tmp1.y */
5829 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5830 alu.op = ALU_OP2_AND_INT;
5831
5832 alu.dst.sel = tmp1;
5833 alu.dst.chan = 0;
5834 alu.dst.write = 1;
5835
5836 alu.src[0].sel = tmp1;
5837 alu.src[0].chan = 0;
5838 alu.src[1].sel = tmp1;
5839 alu.src[1].chan = 1;
5840
5841 alu.last = 1;
5842 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5843 return r;
5844
5845 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
5846 /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
5847 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5848 alu.op = ALU_OP3_CNDE_INT;
5849 alu.is_op3 = 1;
5850
5851 alu.dst.sel = tmp0;
5852 alu.dst.chan = 2;
5853 alu.dst.write = 1;
5854
5855 alu.src[0].sel = tmp1;
5856 alu.src[0].chan = 0;
5857 alu.src[1].sel = tmp0;
5858 alu.src[1].chan = mod ? 3 : 2;
5859 alu.src[2].sel = tmp1;
5860 alu.src[2].chan = 2;
5861
5862 alu.last = 1;
5863 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5864 return r;
5865
5866 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
5867 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5868 alu.op = ALU_OP3_CNDE_INT;
5869 alu.is_op3 = 1;
5870
5871 if (signed_op) {
5872 alu.dst.sel = tmp0;
5873 alu.dst.chan = 2;
5874 alu.dst.write = 1;
5875 } else {
5876 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5877 }
5878
5879 alu.src[0].sel = tmp1;
5880 alu.src[0].chan = 1;
5881 alu.src[1].sel = tmp1;
5882 alu.src[1].chan = 3;
5883 alu.src[2].sel = tmp0;
5884 alu.src[2].chan = 2;
5885
5886 alu.last = 1;
5887 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5888 return r;
5889
5890 if (signed_op) {
5891
5892 /* fix the sign of the result */
5893
5894 if (mod) {
5895
5896 /* tmp0.x = -tmp0.z */
5897 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5898 alu.op = ALU_OP2_SUB_INT;
5899
5900 alu.dst.sel = tmp0;
5901 alu.dst.chan = 0;
5902 alu.dst.write = 1;
5903
5904 alu.src[0].sel = V_SQ_ALU_SRC_0;
5905 alu.src[1].sel = tmp0;
5906 alu.src[1].chan = 2;
5907
5908 alu.last = 1;
5909 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5910 return r;
5911
5912 /* sign of the remainder is the same as the sign of src0 */
5913 /* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
5914 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5915 alu.op = ALU_OP3_CNDGE_INT;
5916 alu.is_op3 = 1;
5917
5918 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5919
5920 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5921 alu.src[1].sel = tmp0;
5922 alu.src[1].chan = 2;
5923 alu.src[2].sel = tmp0;
5924 alu.src[2].chan = 0;
5925
5926 alu.last = 1;
5927 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5928 return r;
5929
5930 } else {
5931
5932 /* tmp0.x = -tmp0.z */
5933 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5934 alu.op = ALU_OP2_SUB_INT;
5935
5936 alu.dst.sel = tmp0;
5937 alu.dst.chan = 0;
5938 alu.dst.write = 1;
5939
5940 alu.src[0].sel = V_SQ_ALU_SRC_0;
5941 alu.src[1].sel = tmp0;
5942 alu.src[1].chan = 2;
5943
5944 alu.last = 1;
5945 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5946 return r;
5947
5948 /* fix the quotient sign (same as the sign of src0*src1) */
5949 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
5950 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5951 alu.op = ALU_OP3_CNDGE_INT;
5952 alu.is_op3 = 1;
5953
5954 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5955
5956 alu.src[0].sel = tmp2;
5957 alu.src[0].chan = 2;
5958 alu.src[1].sel = tmp0;
5959 alu.src[1].chan = 2;
5960 alu.src[2].sel = tmp0;
5961 alu.src[2].chan = 0;
5962
5963 alu.last = 1;
5964 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5965 return r;
5966 }
5967 }
5968 }
5969 return 0;
5970 }
5971
/* UDIV: unsigned quotient, i.e. tgsi_divmod() with mod = 0, signed_op = 0. */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
5976
/* UMOD: unsigned remainder, i.e. tgsi_divmod() with mod = 1, signed_op = 0. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 0;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
5981
/* IDIV: signed quotient, i.e. tgsi_divmod() with mod = 0, signed_op = 1. */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 0;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
5986
/* IMOD: signed remainder, i.e. tgsi_divmod() with mod = 1, signed_op = 1. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int want_remainder = 1;
	const int is_signed = 1;

	return tgsi_divmod(ctx, want_remainder, is_signed);
}
5991
5992
5993 static int tgsi_f2i(struct r600_shader_ctx *ctx)
5994 {
5995 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5996 struct r600_bytecode_alu alu;
5997 int i, r;
5998 unsigned write_mask = inst->Dst[0].Register.WriteMask;
5999 int last_inst = tgsi_last_instruction(write_mask);
6000
6001 for (i = 0; i < 4; i++) {
6002 if (!(write_mask & (1<<i)))
6003 continue;
6004
6005 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6006 alu.op = ALU_OP1_TRUNC;
6007
6008 alu.dst.sel = ctx->temp_reg;
6009 alu.dst.chan = i;
6010 alu.dst.write = 1;
6011
6012 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6013 if (i == last_inst)
6014 alu.last = 1;
6015 r = r600_bytecode_add_alu(ctx->bc, &alu);
6016 if (r)
6017 return r;
6018 }
6019
6020 for (i = 0; i < 4; i++) {
6021 if (!(write_mask & (1<<i)))
6022 continue;
6023
6024 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6025 alu.op = ctx->inst_info->op;
6026
6027 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6028
6029 alu.src[0].sel = ctx->temp_reg;
6030 alu.src[0].chan = i;
6031
6032 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
6033 alu.last = 1;
6034 r = r600_bytecode_add_alu(ctx->bc, &alu);
6035 if (r)
6036 return r;
6037 }
6038
6039 return 0;
6040 }
6041
6042 static int tgsi_iabs(struct r600_shader_ctx *ctx)
6043 {
6044 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6045 struct r600_bytecode_alu alu;
6046 int i, r;
6047 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6048 int last_inst = tgsi_last_instruction(write_mask);
6049
6050 /* tmp = -src */
6051 for (i = 0; i < 4; i++) {
6052 if (!(write_mask & (1<<i)))
6053 continue;
6054
6055 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6056 alu.op = ALU_OP2_SUB_INT;
6057
6058 alu.dst.sel = ctx->temp_reg;
6059 alu.dst.chan = i;
6060 alu.dst.write = 1;
6061
6062 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6063 alu.src[0].sel = V_SQ_ALU_SRC_0;
6064
6065 if (i == last_inst)
6066 alu.last = 1;
6067 r = r600_bytecode_add_alu(ctx->bc, &alu);
6068 if (r)
6069 return r;
6070 }
6071
6072 /* dst = (src >= 0 ? src : tmp) */
6073 for (i = 0; i < 4; i++) {
6074 if (!(write_mask & (1<<i)))
6075 continue;
6076
6077 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6078 alu.op = ALU_OP3_CNDGE_INT;
6079 alu.is_op3 = 1;
6080 alu.dst.write = 1;
6081
6082 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6083
6084 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6085 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6086 alu.src[2].sel = ctx->temp_reg;
6087 alu.src[2].chan = i;
6088
6089 if (i == last_inst)
6090 alu.last = 1;
6091 r = r600_bytecode_add_alu(ctx->bc, &alu);
6092 if (r)
6093 return r;
6094 }
6095 return 0;
6096 }
6097
6098 static int tgsi_issg(struct r600_shader_ctx *ctx)
6099 {
6100 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6101 struct r600_bytecode_alu alu;
6102 int i, r;
6103 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6104 int last_inst = tgsi_last_instruction(write_mask);
6105
6106 /* tmp = (src >= 0 ? src : -1) */
6107 for (i = 0; i < 4; i++) {
6108 if (!(write_mask & (1<<i)))
6109 continue;
6110
6111 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6112 alu.op = ALU_OP3_CNDGE_INT;
6113 alu.is_op3 = 1;
6114
6115 alu.dst.sel = ctx->temp_reg;
6116 alu.dst.chan = i;
6117 alu.dst.write = 1;
6118
6119 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6120 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6121 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
6122
6123 if (i == last_inst)
6124 alu.last = 1;
6125 r = r600_bytecode_add_alu(ctx->bc, &alu);
6126 if (r)
6127 return r;
6128 }
6129
6130 /* dst = (tmp > 0 ? 1 : tmp) */
6131 for (i = 0; i < 4; i++) {
6132 if (!(write_mask & (1<<i)))
6133 continue;
6134
6135 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6136 alu.op = ALU_OP3_CNDGT_INT;
6137 alu.is_op3 = 1;
6138 alu.dst.write = 1;
6139
6140 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6141
6142 alu.src[0].sel = ctx->temp_reg;
6143 alu.src[0].chan = i;
6144
6145 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
6146
6147 alu.src[2].sel = ctx->temp_reg;
6148 alu.src[2].chan = i;
6149
6150 if (i == last_inst)
6151 alu.last = 1;
6152 r = r600_bytecode_add_alu(ctx->bc, &alu);
6153 if (r)
6154 return r;
6155 }
6156 return 0;
6157 }
6158
6159
6160
6161 static int tgsi_ssg(struct r600_shader_ctx *ctx)
6162 {
6163 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6164 struct r600_bytecode_alu alu;
6165 int i, r;
6166
6167 /* tmp = (src > 0 ? 1 : src) */
6168 for (i = 0; i < 4; i++) {
6169 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6170 alu.op = ALU_OP3_CNDGT;
6171 alu.is_op3 = 1;
6172
6173 alu.dst.sel = ctx->temp_reg;
6174 alu.dst.chan = i;
6175
6176 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6177 alu.src[1].sel = V_SQ_ALU_SRC_1;
6178 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6179
6180 if (i == 3)
6181 alu.last = 1;
6182 r = r600_bytecode_add_alu(ctx->bc, &alu);
6183 if (r)
6184 return r;
6185 }
6186
6187 /* dst = (-tmp > 0 ? -1 : tmp) */
6188 for (i = 0; i < 4; i++) {
6189 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6190 alu.op = ALU_OP3_CNDGT;
6191 alu.is_op3 = 1;
6192 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6193
6194 alu.src[0].sel = ctx->temp_reg;
6195 alu.src[0].chan = i;
6196 alu.src[0].neg = 1;
6197
6198 alu.src[1].sel = V_SQ_ALU_SRC_1;
6199 alu.src[1].neg = 1;
6200
6201 alu.src[2].sel = ctx->temp_reg;
6202 alu.src[2].chan = i;
6203
6204 if (i == 3)
6205 alu.last = 1;
6206 r = r600_bytecode_add_alu(ctx->bc, &alu);
6207 if (r)
6208 return r;
6209 }
6210 return 0;
6211 }
6212
6213 static int tgsi_bfi(struct r600_shader_ctx *ctx)
6214 {
6215 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6216 struct r600_bytecode_alu alu;
6217 int i, r, t1, t2;
6218
6219 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6220 int last_inst = tgsi_last_instruction(write_mask);
6221
6222 t1 = r600_get_temp(ctx);
6223
6224 for (i = 0; i < 4; i++) {
6225 if (!(write_mask & (1<<i)))
6226 continue;
6227
6228 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6229 alu.op = ALU_OP2_SETGE_INT;
6230 r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
6231 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6232 alu.src[1].value = 32;
6233 alu.dst.sel = ctx->temp_reg;
6234 alu.dst.chan = i;
6235 alu.dst.write = 1;
6236 alu.last = i == last_inst;
6237 r = r600_bytecode_add_alu(ctx->bc, &alu);
6238 if (r)
6239 return r;
6240 }
6241
6242 for (i = 0; i < 4; i++) {
6243 if (!(write_mask & (1<<i)))
6244 continue;
6245
6246 /* create mask tmp */
6247 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6248 alu.op = ALU_OP2_BFM_INT;
6249 alu.dst.sel = t1;
6250 alu.dst.chan = i;
6251 alu.dst.write = 1;
6252 alu.last = i == last_inst;
6253
6254 r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
6255 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6256
6257 r = r600_bytecode_add_alu(ctx->bc, &alu);
6258 if (r)
6259 return r;
6260 }
6261
6262 t2 = r600_get_temp(ctx);
6263
6264 for (i = 0; i < 4; i++) {
6265 if (!(write_mask & (1<<i)))
6266 continue;
6267
6268 /* shift insert left */
6269 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6270 alu.op = ALU_OP2_LSHL_INT;
6271 alu.dst.sel = t2;
6272 alu.dst.chan = i;
6273 alu.dst.write = 1;
6274 alu.last = i == last_inst;
6275
6276 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6277 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6278
6279 r = r600_bytecode_add_alu(ctx->bc, &alu);
6280 if (r)
6281 return r;
6282 }
6283
6284 for (i = 0; i < 4; i++) {
6285 if (!(write_mask & (1<<i)))
6286 continue;
6287
6288 /* actual bitfield insert */
6289 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6290 alu.op = ALU_OP3_BFI_INT;
6291 alu.is_op3 = 1;
6292 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6293 alu.dst.chan = i;
6294 alu.dst.write = 1;
6295 alu.last = i == last_inst;
6296
6297 alu.src[0].sel = t1;
6298 alu.src[0].chan = i;
6299 alu.src[1].sel = t2;
6300 alu.src[1].chan = i;
6301 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
6302
6303 r = r600_bytecode_add_alu(ctx->bc, &alu);
6304 if (r)
6305 return r;
6306 }
6307
6308 for (i = 0; i < 4; i++) {
6309 if (!(write_mask & (1<<i)))
6310 continue;
6311 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6312 alu.op = ALU_OP3_CNDE_INT;
6313 alu.is_op3 = 1;
6314 alu.src[0].sel = ctx->temp_reg;
6315 alu.src[0].chan = i;
6316 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
6317
6318 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6319
6320 alu.src[1].sel = alu.dst.sel;
6321 alu.src[1].chan = i;
6322
6323 alu.last = i == last_inst;
6324 r = r600_bytecode_add_alu(ctx->bc, &alu);
6325 if (r)
6326 return r;
6327 }
6328 return 0;
6329 }
6330
6331 static int tgsi_msb(struct r600_shader_ctx *ctx)
6332 {
6333 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6334 struct r600_bytecode_alu alu;
6335 int i, r, t1, t2;
6336
6337 unsigned write_mask = inst->Dst[0].Register.WriteMask;
6338 int last_inst = tgsi_last_instruction(write_mask);
6339
6340 assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
6341 ctx->inst_info->op == ALU_OP1_FFBH_UINT);
6342
6343 t1 = ctx->temp_reg;
6344
6345 /* bit position is indexed from lsb by TGSI, and from msb by the hardware */
6346 for (i = 0; i < 4; i++) {
6347 if (!(write_mask & (1<<i)))
6348 continue;
6349
6350 /* t1 = FFBH_INT / FFBH_UINT */
6351 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6352 alu.op = ctx->inst_info->op;
6353 alu.dst.sel = t1;
6354 alu.dst.chan = i;
6355 alu.dst.write = 1;
6356 alu.last = i == last_inst;
6357
6358 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6359
6360 r = r600_bytecode_add_alu(ctx->bc, &alu);
6361 if (r)
6362 return r;
6363 }
6364
6365 t2 = r600_get_temp(ctx);
6366
6367 for (i = 0; i < 4; i++) {
6368 if (!(write_mask & (1<<i)))
6369 continue;
6370
6371 /* t2 = 31 - t1 */
6372 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6373 alu.op = ALU_OP2_SUB_INT;
6374 alu.dst.sel = t2;
6375 alu.dst.chan = i;
6376 alu.dst.write = 1;
6377 alu.last = i == last_inst;
6378
6379 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
6380 alu.src[0].value = 31;
6381 alu.src[1].sel = t1;
6382 alu.src[1].chan = i;
6383
6384 r = r600_bytecode_add_alu(ctx->bc, &alu);
6385 if (r)
6386 return r;
6387 }
6388
6389 for (i = 0; i < 4; i++) {
6390 if (!(write_mask & (1<<i)))
6391 continue;
6392
6393 /* result = t1 >= 0 ? t2 : t1 */
6394 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6395 alu.op = ALU_OP3_CNDGE_INT;
6396 alu.is_op3 = 1;
6397 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6398 alu.dst.chan = i;
6399 alu.dst.write = 1;
6400 alu.last = i == last_inst;
6401
6402 alu.src[0].sel = t1;
6403 alu.src[0].chan = i;
6404 alu.src[1].sel = t2;
6405 alu.src[1].chan = i;
6406 alu.src[2].sel = t1;
6407 alu.src[2].chan = i;
6408
6409 r = r600_bytecode_add_alu(ctx->bc, &alu);
6410 if (r)
6411 return r;
6412 }
6413
6414 return 0;
6415 }
6416
6417 static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
6418 {
6419 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6420 struct r600_bytecode_alu alu;
6421 int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
6422 unsigned location;
6423 const int input = inst->Src[0].Register.Index + ctx->shader->nsys_inputs;
6424
6425 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
6426
6427 /* Interpolators have been marked for use already by allocate_system_value_inputs */
6428 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6429 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6430 location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
6431 }
6432 else {
6433 location = TGSI_INTERPOLATE_LOC_CENTROID;
6434 }
6435
6436 k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
6437 if (k < 0)
6438 k = 0;
6439 interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
6440 interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);
6441
6442 /* NOTE: currently offset is not perspective correct */
6443 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6444 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6445 int sample_gpr = -1;
6446 int gradientsH, gradientsV;
6447 struct r600_bytecode_tex tex;
6448
6449 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6450 sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
6451 }
6452
6453 gradientsH = r600_get_temp(ctx);
6454 gradientsV = r600_get_temp(ctx);
6455 for (i = 0; i < 2; i++) {
6456 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6457 tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
6458 tex.src_gpr = interp_gpr;
6459 tex.src_sel_x = interp_base_chan + 0;
6460 tex.src_sel_y = interp_base_chan + 1;
6461 tex.src_sel_z = 0;
6462 tex.src_sel_w = 0;
6463 tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
6464 tex.dst_sel_x = 0;
6465 tex.dst_sel_y = 1;
6466 tex.dst_sel_z = 7;
6467 tex.dst_sel_w = 7;
6468 tex.inst_mod = 1; // Use per pixel gradient calculation
6469 tex.sampler_id = 0;
6470 tex.resource_id = tex.sampler_id;
6471 r = r600_bytecode_add_tex(ctx->bc, &tex);
6472 if (r)
6473 return r;
6474 }
6475
6476 for (i = 0; i < 2; i++) {
6477 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6478 alu.op = ALU_OP3_MULADD;
6479 alu.is_op3 = 1;
6480 alu.src[0].sel = gradientsH;
6481 alu.src[0].chan = i;
6482 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6483 alu.src[1].sel = sample_gpr;
6484 alu.src[1].chan = 2;
6485 }
6486 else {
6487 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
6488 }
6489 alu.src[2].sel = interp_gpr;
6490 alu.src[2].chan = interp_base_chan + i;
6491 alu.dst.sel = ctx->temp_reg;
6492 alu.dst.chan = i;
6493 alu.last = i == 1;
6494
6495 r = r600_bytecode_add_alu(ctx->bc, &alu);
6496 if (r)
6497 return r;
6498 }
6499
6500 for (i = 0; i < 2; i++) {
6501 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6502 alu.op = ALU_OP3_MULADD;
6503 alu.is_op3 = 1;
6504 alu.src[0].sel = gradientsV;
6505 alu.src[0].chan = i;
6506 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6507 alu.src[1].sel = sample_gpr;
6508 alu.src[1].chan = 3;
6509 }
6510 else {
6511 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
6512 }
6513 alu.src[2].sel = ctx->temp_reg;
6514 alu.src[2].chan = i;
6515 alu.dst.sel = ctx->temp_reg;
6516 alu.dst.chan = i;
6517 alu.last = i == 1;
6518
6519 r = r600_bytecode_add_alu(ctx->bc, &alu);
6520 if (r)
6521 return r;
6522 }
6523 }
6524
6525 tmp = r600_get_temp(ctx);
6526 for (i = 0; i < 8; i++) {
6527 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6528 alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;
6529
6530 alu.dst.sel = tmp;
6531 if ((i > 1 && i < 6)) {
6532 alu.dst.write = 1;
6533 }
6534 else {
6535 alu.dst.write = 0;
6536 }
6537 alu.dst.chan = i % 4;
6538
6539 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
6540 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
6541 alu.src[0].sel = ctx->temp_reg;
6542 alu.src[0].chan = 1 - (i % 2);
6543 } else {
6544 alu.src[0].sel = interp_gpr;
6545 alu.src[0].chan = interp_base_chan + 1 - (i % 2);
6546 }
6547 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
6548 alu.src[1].chan = 0;
6549
6550 alu.last = i % 4 == 3;
6551 alu.bank_swizzle_force = SQ_ALU_VEC_210;
6552
6553 r = r600_bytecode_add_alu(ctx->bc, &alu);
6554 if (r)
6555 return r;
6556 }
6557
6558 // INTERP can't swizzle dst
6559 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6560 for (i = 0; i <= lasti; i++) {
6561 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6562 continue;
6563
6564 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6565 alu.op = ALU_OP1_MOV;
6566 alu.src[0].sel = tmp;
6567 alu.src[0].chan = ctx->src[0].swizzle[i];
6568 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6569 alu.dst.write = 1;
6570 alu.last = i == lasti;
6571 r = r600_bytecode_add_alu(ctx->bc, &alu);
6572 if (r)
6573 return r;
6574 }
6575
6576 return 0;
6577 }
6578
6579
6580 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
6581 {
6582 struct r600_bytecode_alu alu;
6583 int i, r;
6584
6585 for (i = 0; i < 4; i++) {
6586 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6587 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
6588 alu.op = ALU_OP0_NOP;
6589 alu.dst.chan = i;
6590 } else {
6591 alu.op = ALU_OP1_MOV;
6592 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6593 alu.src[0].sel = ctx->temp_reg;
6594 alu.src[0].chan = i;
6595 }
6596 if (i == 3) {
6597 alu.last = 1;
6598 }
6599 r = r600_bytecode_add_alu(ctx->bc, &alu);
6600 if (r)
6601 return r;
6602 }
6603 return 0;
6604 }
6605
6606 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
6607 unsigned temp, int chan,
6608 struct r600_bytecode_alu_src *bc_src,
6609 const struct r600_shader_src *shader_src)
6610 {
6611 struct r600_bytecode_alu alu;
6612 int r;
6613
6614 r600_bytecode_src(bc_src, shader_src, chan);
6615
6616 /* op3 operands don't support abs modifier */
6617 if (bc_src->abs) {
6618 assert(temp!=0); /* we actually need the extra register, make sure it is allocated. */
6619 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6620 alu.op = ALU_OP1_MOV;
6621 alu.dst.sel = temp;
6622 alu.dst.chan = chan;
6623 alu.dst.write = 1;
6624
6625 alu.src[0] = *bc_src;
6626 alu.last = true; // sufficient?
6627 r = r600_bytecode_add_alu(ctx->bc, &alu);
6628 if (r)
6629 return r;
6630
6631 memset(bc_src, 0, sizeof(*bc_src));
6632 bc_src->sel = temp;
6633 bc_src->chan = chan;
6634 }
6635 return 0;
6636 }
6637
6638 static int tgsi_op3_dst(struct r600_shader_ctx *ctx, int dst)
6639 {
6640 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6641 struct r600_bytecode_alu alu;
6642 int i, j, r;
6643 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6644 int temp_regs[4];
6645 unsigned op = ctx->inst_info->op;
6646
6647 if (op == ALU_OP3_MULADD_IEEE &&
6648 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
6649 op = ALU_OP3_MULADD;
6650
6651 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6652 temp_regs[j] = 0;
6653 if (ctx->src[j].abs)
6654 temp_regs[j] = r600_get_temp(ctx);
6655 }
6656 for (i = 0; i < lasti + 1; i++) {
6657 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6658 continue;
6659
6660 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6661 alu.op = op;
6662 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6663 r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
6664 if (r)
6665 return r;
6666 }
6667
6668 if (dst == -1) {
6669 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6670 } else {
6671 alu.dst.sel = dst;
6672 }
6673 alu.dst.chan = i;
6674 alu.dst.write = 1;
6675 alu.is_op3 = 1;
6676 if (i == lasti) {
6677 alu.last = 1;
6678 }
6679 r = r600_bytecode_add_alu(ctx->bc, &alu);
6680 if (r)
6681 return r;
6682 }
6683 return 0;
6684 }
6685
/*
 * Three-operand ALU handler that writes straight to the TGSI destination
 * of the current instruction.
 */
static int tgsi_op3(struct r600_shader_ctx *ctx)
{
	/* -1 tells tgsi_op3_dst() to use the instruction's own destination */
	const int use_tgsi_dst = -1;

	return tgsi_op3_dst(ctx, use_tgsi_dst);
}
6690
6691 static int tgsi_dp(struct r600_shader_ctx *ctx)
6692 {
6693 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6694 struct r600_bytecode_alu alu;
6695 int i, j, r;
6696 unsigned op = ctx->inst_info->op;
6697 if (op == ALU_OP2_DOT4_IEEE &&
6698 ctx->info.properties[TGSI_PROPERTY_MUL_ZERO_WINS])
6699 op = ALU_OP2_DOT4;
6700
6701 for (i = 0; i < 4; i++) {
6702 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6703 alu.op = op;
6704 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6705 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
6706 }
6707
6708 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6709 alu.dst.chan = i;
6710 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
6711 /* handle some special cases */
6712 switch (inst->Instruction.Opcode) {
6713 case TGSI_OPCODE_DP2:
6714 if (i > 1) {
6715 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6716 alu.src[0].chan = alu.src[1].chan = 0;
6717 }
6718 break;
6719 case TGSI_OPCODE_DP3:
6720 if (i > 2) {
6721 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
6722 alu.src[0].chan = alu.src[1].chan = 0;
6723 }
6724 break;
6725 default:
6726 break;
6727 }
6728 if (i == 3) {
6729 alu.last = 1;
6730 }
6731 r = r600_bytecode_add_alu(ctx->bc, &alu);
6732 if (r)
6733 return r;
6734 }
6735 return 0;
6736 }
6737
6738 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
6739 unsigned index)
6740 {
6741 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6742 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
6743 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
6744 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
6745 ctx->src[index].neg || ctx->src[index].abs ||
6746 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == PIPE_SHADER_GEOMETRY);
6747 }
6748
6749 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
6750 unsigned index)
6751 {
6752 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6753 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
6754 }
6755
6756 static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
6757 {
6758 struct r600_bytecode_vtx vtx;
6759 struct r600_bytecode_alu alu;
6760 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6761 int src_gpr, r, i;
6762 int id = tgsi_tex_get_src_gpr(ctx, 1);
6763 int sampler_index_mode = inst->Src[1].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
6764
6765 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
6766 if (src_requires_loading) {
6767 for (i = 0; i < 4; i++) {
6768 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6769 alu.op = ALU_OP1_MOV;
6770 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6771 alu.dst.sel = ctx->temp_reg;
6772 alu.dst.chan = i;
6773 if (i == 3)
6774 alu.last = 1;
6775 alu.dst.write = 1;
6776 r = r600_bytecode_add_alu(ctx->bc, &alu);
6777 if (r)
6778 return r;
6779 }
6780 src_gpr = ctx->temp_reg;
6781 }
6782
6783 memset(&vtx, 0, sizeof(vtx));
6784 vtx.op = FETCH_OP_VFETCH;
6785 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
6786 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
6787 vtx.src_gpr = src_gpr;
6788 vtx.mega_fetch_count = 16;
6789 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6790 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
6791 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
6792 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
6793 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
6794 vtx.use_const_fields = 1;
6795 vtx.buffer_index_mode = sampler_index_mode;
6796
6797 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
6798 return r;
6799
6800 if (ctx->bc->chip_class >= EVERGREEN)
6801 return 0;
6802
6803 for (i = 0; i < 4; i++) {
6804 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6805 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6806 continue;
6807
6808 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6809 alu.op = ALU_OP2_AND_INT;
6810
6811 alu.dst.chan = i;
6812 alu.dst.sel = vtx.dst_gpr;
6813 alu.dst.write = 1;
6814
6815 alu.src[0].sel = vtx.dst_gpr;
6816 alu.src[0].chan = i;
6817
6818 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
6819 alu.src[1].sel += (id * 2);
6820 alu.src[1].chan = i % 4;
6821 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6822
6823 if (i == lasti)
6824 alu.last = 1;
6825 r = r600_bytecode_add_alu(ctx->bc, &alu);
6826 if (r)
6827 return r;
6828 }
6829
6830 if (inst->Dst[0].Register.WriteMask & 3) {
6831 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6832 alu.op = ALU_OP2_OR_INT;
6833
6834 alu.dst.chan = 3;
6835 alu.dst.sel = vtx.dst_gpr;
6836 alu.dst.write = 1;
6837
6838 alu.src[0].sel = vtx.dst_gpr;
6839 alu.src[0].chan = 3;
6840
6841 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
6842 alu.src[1].chan = 0;
6843 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6844
6845 alu.last = 1;
6846 r = r600_bytecode_add_alu(ctx->bc, &alu);
6847 if (r)
6848 return r;
6849 }
6850 return 0;
6851 }
6852
6853 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx, int reg_idx, int offset)
6854 {
6855 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6856 int r;
6857 int id = tgsi_tex_get_src_gpr(ctx, reg_idx) + offset;
6858 int sampler_index_mode = inst->Src[reg_idx].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
6859
6860 if (ctx->bc->chip_class < EVERGREEN) {
6861 struct r600_bytecode_alu alu;
6862 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6863 alu.op = ALU_OP1_MOV;
6864 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
6865 /* r600 we have them at channel 2 of the second dword */
6866 alu.src[0].sel += (id * 2) + 1;
6867 alu.src[0].chan = 1;
6868 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6869 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
6870 alu.last = 1;
6871 r = r600_bytecode_add_alu(ctx->bc, &alu);
6872 if (r)
6873 return r;
6874 return 0;
6875 } else {
6876 struct r600_bytecode_vtx vtx;
6877 memset(&vtx, 0, sizeof(vtx));
6878 vtx.op = FETCH_OP_GDS_MIN_UINT; /* aka GET_BUFFER_RESINFO */
6879 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
6880 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
6881 vtx.src_gpr = 0;
6882 vtx.mega_fetch_count = 16; /* no idea here really... */
6883 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6884 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
6885 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 4 : 7; /* SEL_Y */
6886 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 4 : 7; /* SEL_Z */
6887 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 4 : 7; /* SEL_W */
6888 vtx.data_format = FMT_32_32_32_32;
6889 vtx.buffer_index_mode = sampler_index_mode;
6890
6891 if ((r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx)))
6892 return r;
6893 return 0;
6894 }
6895 }
6896
6897
6898 static int tgsi_tex(struct r600_shader_ctx *ctx)
6899 {
6900 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6901 struct r600_bytecode_tex tex;
6902 struct r600_bytecode_alu alu;
6903 unsigned src_gpr;
6904 int r, i, j;
6905 int opcode;
6906 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
6907 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
6908 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
6909 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
6910
6911 bool txf_add_offsets = inst->Texture.NumOffsets &&
6912 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
6913 inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
6914
6915 /* Texture fetch instructions can only use gprs as source.
6916 * Also they cannot negate the source or take the absolute value */
6917 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
6918 tgsi_tex_src_requires_loading(ctx, 0)) ||
6919 read_compressed_msaa || txf_add_offsets;
6920
6921 boolean src_loaded = FALSE;
6922 unsigned sampler_src_reg = 1;
6923 int8_t offset_x = 0, offset_y = 0, offset_z = 0;
6924 boolean has_txq_cube_array_z = false;
6925 unsigned sampler_index_mode;
6926
6927 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
6928 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6929 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
6930 if (inst->Dst[0].Register.WriteMask & 4) {
6931 ctx->shader->has_txq_cube_array_z_comp = true;
6932 has_txq_cube_array_z = true;
6933 }
6934
6935 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
6936 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6937 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
6938 inst->Instruction.Opcode == TGSI_OPCODE_TG4)
6939 sampler_src_reg = 2;
6940
6941 /* TGSI moves the sampler to src reg 3 for TXD */
6942 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
6943 sampler_src_reg = 3;
6944
6945 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
6946
6947 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
6948
6949 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
6950 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
6951 if (ctx->bc->chip_class < EVERGREEN)
6952 ctx->shader->uses_tex_buffers = true;
6953 return r600_do_buffer_txq(ctx, 1, 0);
6954 }
6955 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
6956 if (ctx->bc->chip_class < EVERGREEN)
6957 ctx->shader->uses_tex_buffers = true;
6958 return do_vtx_fetch_inst(ctx, src_requires_loading);
6959 }
6960 }
6961
6962 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
6963 int out_chan;
6964 /* Add perspective divide */
6965 if (ctx->bc->chip_class == CAYMAN) {
6966 out_chan = 2;
6967 for (i = 0; i < 3; i++) {
6968 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6969 alu.op = ALU_OP1_RECIP_IEEE;
6970 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6971
6972 alu.dst.sel = ctx->temp_reg;
6973 alu.dst.chan = i;
6974 if (i == 2)
6975 alu.last = 1;
6976 if (out_chan == i)
6977 alu.dst.write = 1;
6978 r = r600_bytecode_add_alu(ctx->bc, &alu);
6979 if (r)
6980 return r;
6981 }
6982
6983 } else {
6984 out_chan = 3;
6985 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6986 alu.op = ALU_OP1_RECIP_IEEE;
6987 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6988
6989 alu.dst.sel = ctx->temp_reg;
6990 alu.dst.chan = out_chan;
6991 alu.last = 1;
6992 alu.dst.write = 1;
6993 r = r600_bytecode_add_alu(ctx->bc, &alu);
6994 if (r)
6995 return r;
6996 }
6997
6998 for (i = 0; i < 3; i++) {
6999 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7000 alu.op = ALU_OP2_MUL;
7001 alu.src[0].sel = ctx->temp_reg;
7002 alu.src[0].chan = out_chan;
7003 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
7004 alu.dst.sel = ctx->temp_reg;
7005 alu.dst.chan = i;
7006 alu.dst.write = 1;
7007 r = r600_bytecode_add_alu(ctx->bc, &alu);
7008 if (r)
7009 return r;
7010 }
7011 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7012 alu.op = ALU_OP1_MOV;
7013 alu.src[0].sel = V_SQ_ALU_SRC_1;
7014 alu.src[0].chan = 0;
7015 alu.dst.sel = ctx->temp_reg;
7016 alu.dst.chan = 3;
7017 alu.last = 1;
7018 alu.dst.write = 1;
7019 r = r600_bytecode_add_alu(ctx->bc, &alu);
7020 if (r)
7021 return r;
7022 src_loaded = TRUE;
7023 src_gpr = ctx->temp_reg;
7024 }
7025
7026
7027 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7028 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7029 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7030 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7031 inst->Instruction.Opcode != TGSI_OPCODE_TXQ) {
7032
7033 static const unsigned src0_swizzle[] = {2, 2, 0, 1};
7034 static const unsigned src1_swizzle[] = {1, 0, 2, 2};
7035
7036 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
7037 for (i = 0; i < 4; i++) {
7038 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7039 alu.op = ALU_OP2_CUBE;
7040 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
7041 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
7042 alu.dst.sel = ctx->temp_reg;
7043 alu.dst.chan = i;
7044 if (i == 3)
7045 alu.last = 1;
7046 alu.dst.write = 1;
7047 r = r600_bytecode_add_alu(ctx->bc, &alu);
7048 if (r)
7049 return r;
7050 }
7051
7052 /* tmp1.z = RCP_e(|tmp1.z|) */
7053 if (ctx->bc->chip_class == CAYMAN) {
7054 for (i = 0; i < 3; i++) {
7055 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7056 alu.op = ALU_OP1_RECIP_IEEE;
7057 alu.src[0].sel = ctx->temp_reg;
7058 alu.src[0].chan = 2;
7059 alu.src[0].abs = 1;
7060 alu.dst.sel = ctx->temp_reg;
7061 alu.dst.chan = i;
7062 if (i == 2)
7063 alu.dst.write = 1;
7064 if (i == 2)
7065 alu.last = 1;
7066 r = r600_bytecode_add_alu(ctx->bc, &alu);
7067 if (r)
7068 return r;
7069 }
7070 } else {
7071 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7072 alu.op = ALU_OP1_RECIP_IEEE;
7073 alu.src[0].sel = ctx->temp_reg;
7074 alu.src[0].chan = 2;
7075 alu.src[0].abs = 1;
7076 alu.dst.sel = ctx->temp_reg;
7077 alu.dst.chan = 2;
7078 alu.dst.write = 1;
7079 alu.last = 1;
7080 r = r600_bytecode_add_alu(ctx->bc, &alu);
7081 if (r)
7082 return r;
7083 }
7084
7085 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
7086 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
7087 * muladd has no writemask, have to use another temp
7088 */
7089 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7090 alu.op = ALU_OP3_MULADD;
7091 alu.is_op3 = 1;
7092
7093 alu.src[0].sel = ctx->temp_reg;
7094 alu.src[0].chan = 0;
7095 alu.src[1].sel = ctx->temp_reg;
7096 alu.src[1].chan = 2;
7097
7098 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7099 alu.src[2].chan = 0;
7100 alu.src[2].value = u_bitcast_f2u(1.5f);
7101
7102 alu.dst.sel = ctx->temp_reg;
7103 alu.dst.chan = 0;
7104 alu.dst.write = 1;
7105
7106 r = r600_bytecode_add_alu(ctx->bc, &alu);
7107 if (r)
7108 return r;
7109
7110 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7111 alu.op = ALU_OP3_MULADD;
7112 alu.is_op3 = 1;
7113
7114 alu.src[0].sel = ctx->temp_reg;
7115 alu.src[0].chan = 1;
7116 alu.src[1].sel = ctx->temp_reg;
7117 alu.src[1].chan = 2;
7118
7119 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
7120 alu.src[2].chan = 0;
7121 alu.src[2].value = u_bitcast_f2u(1.5f);
7122
7123 alu.dst.sel = ctx->temp_reg;
7124 alu.dst.chan = 1;
7125 alu.dst.write = 1;
7126
7127 alu.last = 1;
7128 r = r600_bytecode_add_alu(ctx->bc, &alu);
7129 if (r)
7130 return r;
7131 /* write initial compare value into Z component
7132 - W src 0 for shadow cube
7133 - X src 1 for shadow cube array */
7134 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7135 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7136 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7137 alu.op = ALU_OP1_MOV;
7138 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
7139 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7140 else
7141 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7142 alu.dst.sel = ctx->temp_reg;
7143 alu.dst.chan = 2;
7144 alu.dst.write = 1;
7145 alu.last = 1;
7146 r = r600_bytecode_add_alu(ctx->bc, &alu);
7147 if (r)
7148 return r;
7149 }
7150
7151 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7152 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7153 if (ctx->bc->chip_class >= EVERGREEN) {
7154 int mytmp = r600_get_temp(ctx);
7155 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7156 alu.op = ALU_OP1_MOV;
7157 alu.src[0].sel = ctx->temp_reg;
7158 alu.src[0].chan = 3;
7159 alu.dst.sel = mytmp;
7160 alu.dst.chan = 0;
7161 alu.dst.write = 1;
7162 alu.last = 1;
7163 r = r600_bytecode_add_alu(ctx->bc, &alu);
7164 if (r)
7165 return r;
7166
7167 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */
7168 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7169 alu.op = ALU_OP3_MULADD;
7170 alu.is_op3 = 1;
7171 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7172 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7173 alu.src[1].chan = 0;
7174 alu.src[1].value = u_bitcast_f2u(8.0f);
7175 alu.src[2].sel = mytmp;
7176 alu.src[2].chan = 0;
7177 alu.dst.sel = ctx->temp_reg;
7178 alu.dst.chan = 3;
7179 alu.dst.write = 1;
7180 alu.last = 1;
7181 r = r600_bytecode_add_alu(ctx->bc, &alu);
7182 if (r)
7183 return r;
7184 } else if (ctx->bc->chip_class < EVERGREEN) {
7185 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7186 tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
7187 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7188 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7189 tex.src_gpr = r600_get_temp(ctx);
7190 tex.src_sel_x = 0;
7191 tex.src_sel_y = 0;
7192 tex.src_sel_z = 0;
7193 tex.src_sel_w = 0;
7194 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7195 tex.coord_type_x = 1;
7196 tex.coord_type_y = 1;
7197 tex.coord_type_z = 1;
7198 tex.coord_type_w = 1;
7199 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7200 alu.op = ALU_OP1_MOV;
7201 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7202 alu.dst.sel = tex.src_gpr;
7203 alu.dst.chan = 0;
7204 alu.last = 1;
7205 alu.dst.write = 1;
7206 r = r600_bytecode_add_alu(ctx->bc, &alu);
7207 if (r)
7208 return r;
7209
7210 r = r600_bytecode_add_tex(ctx->bc, &tex);
7211 if (r)
7212 return r;
7213 }
7214
7215 }
7216
7217 /* for cube forms of lod and bias we need to route things */
7218 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
7219 inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
7220 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7221 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
7222 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7223 alu.op = ALU_OP1_MOV;
7224 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
7225 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
7226 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
7227 else
7228 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
7229 alu.dst.sel = ctx->temp_reg;
7230 alu.dst.chan = 2;
7231 alu.last = 1;
7232 alu.dst.write = 1;
7233 r = r600_bytecode_add_alu(ctx->bc, &alu);
7234 if (r)
7235 return r;
7236 }
7237
7238 src_loaded = TRUE;
7239 src_gpr = ctx->temp_reg;
7240 }
7241
7242 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
7243 int temp_h = 0, temp_v = 0;
7244 int start_val = 0;
7245
7246 /* if we've already loaded the src (i.e. CUBE don't reload it). */
7247 if (src_loaded == TRUE)
7248 start_val = 1;
7249 else
7250 src_loaded = TRUE;
7251 for (i = start_val; i < 3; i++) {
7252 int treg = r600_get_temp(ctx);
7253
7254 if (i == 0)
7255 src_gpr = treg;
7256 else if (i == 1)
7257 temp_h = treg;
7258 else
7259 temp_v = treg;
7260
7261 for (j = 0; j < 4; j++) {
7262 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7263 alu.op = ALU_OP1_MOV;
7264 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
7265 alu.dst.sel = treg;
7266 alu.dst.chan = j;
7267 if (j == 3)
7268 alu.last = 1;
7269 alu.dst.write = 1;
7270 r = r600_bytecode_add_alu(ctx->bc, &alu);
7271 if (r)
7272 return r;
7273 }
7274 }
7275 for (i = 1; i < 3; i++) {
7276 /* set gradients h/v */
7277 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7278 tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
7279 FETCH_OP_SET_GRADIENTS_V;
7280 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7281 tex.sampler_index_mode = sampler_index_mode;
7282 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7283 tex.resource_index_mode = sampler_index_mode;
7284
7285 tex.src_gpr = (i == 1) ? temp_h : temp_v;
7286 tex.src_sel_x = 0;
7287 tex.src_sel_y = 1;
7288 tex.src_sel_z = 2;
7289 tex.src_sel_w = 3;
7290
7291 tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
7292 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
7293 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
7294 tex.coord_type_x = 1;
7295 tex.coord_type_y = 1;
7296 tex.coord_type_z = 1;
7297 tex.coord_type_w = 1;
7298 }
7299 r = r600_bytecode_add_tex(ctx->bc, &tex);
7300 if (r)
7301 return r;
7302 }
7303 }
7304
7305 if (src_requires_loading && !src_loaded) {
7306 for (i = 0; i < 4; i++) {
7307 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7308 alu.op = ALU_OP1_MOV;
7309 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7310 alu.dst.sel = ctx->temp_reg;
7311 alu.dst.chan = i;
7312 if (i == 3)
7313 alu.last = 1;
7314 alu.dst.write = 1;
7315 r = r600_bytecode_add_alu(ctx->bc, &alu);
7316 if (r)
7317 return r;
7318 }
7319 src_loaded = TRUE;
7320 src_gpr = ctx->temp_reg;
7321 }
7322
7323 /* get offset values */
7324 if (inst->Texture.NumOffsets) {
7325 assert(inst->Texture.NumOffsets == 1);
7326
7327 /* The texture offset feature doesn't work with the TXF instruction
7328 * and must be emulated by adding the offset to the texture coordinates. */
7329 if (txf_add_offsets) {
7330 const struct tgsi_texture_offset *off = inst->TexOffsets;
7331
7332 switch (inst->Texture.Texture) {
7333 case TGSI_TEXTURE_3D:
7334 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7335 alu.op = ALU_OP2_ADD_INT;
7336 alu.src[0].sel = src_gpr;
7337 alu.src[0].chan = 2;
7338 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7339 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
7340 alu.dst.sel = src_gpr;
7341 alu.dst.chan = 2;
7342 alu.dst.write = 1;
7343 alu.last = 1;
7344 r = r600_bytecode_add_alu(ctx->bc, &alu);
7345 if (r)
7346 return r;
7347 /* fall through */
7348
7349 case TGSI_TEXTURE_2D:
7350 case TGSI_TEXTURE_SHADOW2D:
7351 case TGSI_TEXTURE_RECT:
7352 case TGSI_TEXTURE_SHADOWRECT:
7353 case TGSI_TEXTURE_2D_ARRAY:
7354 case TGSI_TEXTURE_SHADOW2D_ARRAY:
7355 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7356 alu.op = ALU_OP2_ADD_INT;
7357 alu.src[0].sel = src_gpr;
7358 alu.src[0].chan = 1;
7359 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7360 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
7361 alu.dst.sel = src_gpr;
7362 alu.dst.chan = 1;
7363 alu.dst.write = 1;
7364 alu.last = 1;
7365 r = r600_bytecode_add_alu(ctx->bc, &alu);
7366 if (r)
7367 return r;
7368 /* fall through */
7369
7370 case TGSI_TEXTURE_1D:
7371 case TGSI_TEXTURE_SHADOW1D:
7372 case TGSI_TEXTURE_1D_ARRAY:
7373 case TGSI_TEXTURE_SHADOW1D_ARRAY:
7374 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7375 alu.op = ALU_OP2_ADD_INT;
7376 alu.src[0].sel = src_gpr;
7377 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7378 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
7379 alu.dst.sel = src_gpr;
7380 alu.dst.write = 1;
7381 alu.last = 1;
7382 r = r600_bytecode_add_alu(ctx->bc, &alu);
7383 if (r)
7384 return r;
7385 break;
7386 /* texture offsets do not apply to other texture targets */
7387 }
7388 } else {
7389 switch (inst->Texture.Texture) {
7390 case TGSI_TEXTURE_3D:
7391 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
7392 /* fallthrough */
7393 case TGSI_TEXTURE_2D:
7394 case TGSI_TEXTURE_SHADOW2D:
7395 case TGSI_TEXTURE_RECT:
7396 case TGSI_TEXTURE_SHADOWRECT:
7397 case TGSI_TEXTURE_2D_ARRAY:
7398 case TGSI_TEXTURE_SHADOW2D_ARRAY:
7399 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
7400 /* fallthrough */
7401 case TGSI_TEXTURE_1D:
7402 case TGSI_TEXTURE_SHADOW1D:
7403 case TGSI_TEXTURE_1D_ARRAY:
7404 case TGSI_TEXTURE_SHADOW1D_ARRAY:
7405 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
7406 }
7407 }
7408 }
7409
7410 /* Obtain the sample index for reading a compressed MSAA color texture.
7411 * To read the FMASK, we use the ldfptr instruction, which tells us
7412 * where the samples are stored.
7413 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
7414 * which is the identity mapping. Each nibble says which physical sample
7415 * should be fetched to get that sample.
7416 *
7417 * Assume src.z contains the sample index. It should be modified like this:
7418 * src.z = (ldfptr() >> (src.z * 4)) & 0xF;
7419 * Then fetch the texel with src.
7420 */
7421 if (read_compressed_msaa) {
7422 unsigned sample_chan = 3;
7423 unsigned temp = r600_get_temp(ctx);
7424 assert(src_loaded);
7425
7426 /* temp.w = ldfptr() */
7427 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7428 tex.op = FETCH_OP_LD;
7429 tex.inst_mod = 1; /* to indicate this is ldfptr */
7430 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7431 tex.sampler_index_mode = sampler_index_mode;
7432 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7433 tex.resource_index_mode = sampler_index_mode;
7434 tex.src_gpr = src_gpr;
7435 tex.dst_gpr = temp;
7436 tex.dst_sel_x = 7; /* mask out these components */
7437 tex.dst_sel_y = 7;
7438 tex.dst_sel_z = 7;
7439 tex.dst_sel_w = 0; /* store X */
7440 tex.src_sel_x = 0;
7441 tex.src_sel_y = 1;
7442 tex.src_sel_z = 2;
7443 tex.src_sel_w = 3;
7444 tex.offset_x = offset_x;
7445 tex.offset_y = offset_y;
7446 tex.offset_z = offset_z;
7447 r = r600_bytecode_add_tex(ctx->bc, &tex);
7448 if (r)
7449 return r;
7450
7451 /* temp.x = sample_index*4 */
7452 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7453 alu.op = ALU_OP2_MULLO_INT;
7454 alu.src[0].sel = src_gpr;
7455 alu.src[0].chan = sample_chan;
7456 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7457 alu.src[1].value = 4;
7458 alu.dst.sel = temp;
7459 alu.dst.chan = 0;
7460 alu.dst.write = 1;
7461 r = emit_mul_int_op(ctx->bc, &alu);
7462 if (r)
7463 return r;
7464
7465 /* sample_index = temp.w >> temp.x */
7466 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7467 alu.op = ALU_OP2_LSHR_INT;
7468 alu.src[0].sel = temp;
7469 alu.src[0].chan = 3;
7470 alu.src[1].sel = temp;
7471 alu.src[1].chan = 0;
7472 alu.dst.sel = src_gpr;
7473 alu.dst.chan = sample_chan;
7474 alu.dst.write = 1;
7475 alu.last = 1;
7476 r = r600_bytecode_add_alu(ctx->bc, &alu);
7477 if (r)
7478 return r;
7479
7480 /* sample_index & 0xF */
7481 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7482 alu.op = ALU_OP2_AND_INT;
7483 alu.src[0].sel = src_gpr;
7484 alu.src[0].chan = sample_chan;
7485 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7486 alu.src[1].value = 0xF;
7487 alu.dst.sel = src_gpr;
7488 alu.dst.chan = sample_chan;
7489 alu.dst.write = 1;
7490 alu.last = 1;
7491 r = r600_bytecode_add_alu(ctx->bc, &alu);
7492 if (r)
7493 return r;
7494 #if 0
7495 /* visualize the FMASK */
7496 for (i = 0; i < 4; i++) {
7497 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7498 alu.op = ALU_OP1_INT_TO_FLT;
7499 alu.src[0].sel = src_gpr;
7500 alu.src[0].chan = sample_chan;
7501 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7502 alu.dst.chan = i;
7503 alu.dst.write = 1;
7504 alu.last = 1;
7505 r = r600_bytecode_add_alu(ctx->bc, &alu);
7506 if (r)
7507 return r;
7508 }
7509 return 0;
7510 #endif
7511 }
7512
7513 /* does this shader want a num layers from TXQ for a cube array? */
7514 if (has_txq_cube_array_z) {
7515 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7516
7517 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7518 alu.op = ALU_OP1_MOV;
7519
7520 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
7521 if (ctx->bc->chip_class >= EVERGREEN) {
7522 /* with eg each dword is number of cubes */
7523 alu.src[0].sel += id / 4;
7524 alu.src[0].chan = id % 4;
7525 } else {
7526 /* r600 we have them at channel 2 of the second dword */
7527 alu.src[0].sel += (id * 2) + 1;
7528 alu.src[0].chan = 2;
7529 }
7530 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
7531 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
7532 alu.last = 1;
7533 r = r600_bytecode_add_alu(ctx->bc, &alu);
7534 if (r)
7535 return r;
7536 /* disable writemask from texture instruction */
7537 inst->Dst[0].Register.WriteMask &= ~4;
7538 }
7539
7540 opcode = ctx->inst_info->op;
7541 if (opcode == FETCH_OP_GATHER4 &&
7542 inst->TexOffsets[0].File != TGSI_FILE_NULL &&
7543 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
7544 opcode = FETCH_OP_GATHER4_O;
7545
7546 /* GATHER4_O/GATHER4_C_O use offset values loaded by
7547 SET_TEXTURE_OFFSETS instruction. The immediate offset values
7548 encoded in the instruction are ignored. */
7549 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7550 tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
7551 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7552 tex.sampler_index_mode = sampler_index_mode;
7553 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7554 tex.resource_index_mode = sampler_index_mode;
7555
7556 tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
7557 tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
7558 tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
7559 tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
7560 tex.src_sel_w = 4;
7561
7562 tex.dst_sel_x = 7;
7563 tex.dst_sel_y = 7;
7564 tex.dst_sel_z = 7;
7565 tex.dst_sel_w = 7;
7566
7567 r = r600_bytecode_add_tex(ctx->bc, &tex);
7568 if (r)
7569 return r;
7570 }
7571
7572 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7573 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7574 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7575 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7576 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
7577 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7578 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7579 switch (opcode) {
7580 case FETCH_OP_SAMPLE:
7581 opcode = FETCH_OP_SAMPLE_C;
7582 break;
7583 case FETCH_OP_SAMPLE_L:
7584 opcode = FETCH_OP_SAMPLE_C_L;
7585 break;
7586 case FETCH_OP_SAMPLE_LB:
7587 opcode = FETCH_OP_SAMPLE_C_LB;
7588 break;
7589 case FETCH_OP_SAMPLE_G:
7590 opcode = FETCH_OP_SAMPLE_C_G;
7591 break;
7592 /* Texture gather variants */
7593 case FETCH_OP_GATHER4:
7594 opcode = FETCH_OP_GATHER4_C;
7595 break;
7596 case FETCH_OP_GATHER4_O:
7597 opcode = FETCH_OP_GATHER4_C_O;
7598 break;
7599 }
7600 }
7601
7602 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
7603 tex.op = opcode;
7604
7605 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
7606 tex.sampler_index_mode = sampler_index_mode;
7607 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
7608 tex.resource_index_mode = sampler_index_mode;
7609 tex.src_gpr = src_gpr;
7610 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7611
7612 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
7613 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
7614 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
7615 }
7616
7617 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
7618 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
7619 tex.inst_mod = texture_component_select;
7620
7621 if (ctx->bc->chip_class == CAYMAN) {
7622 /* GATHER4 result order is different from TGSI TG4 */
7623 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
7624 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
7625 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
7626 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7627 } else {
7628 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7629 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7630 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7631 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7632 }
7633 }
7634 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
7635 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7636 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7637 tex.dst_sel_z = 7;
7638 tex.dst_sel_w = 7;
7639 }
7640 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7641 tex.dst_sel_x = 3;
7642 tex.dst_sel_y = 7;
7643 tex.dst_sel_z = 7;
7644 tex.dst_sel_w = 7;
7645 }
7646 else {
7647 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
7648 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
7649 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
7650 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
7651 }
7652
7653
7654 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
7655 tex.src_sel_x = 4;
7656 tex.src_sel_y = 4;
7657 tex.src_sel_z = 4;
7658 tex.src_sel_w = 4;
7659 } else if (src_loaded) {
7660 tex.src_sel_x = 0;
7661 tex.src_sel_y = 1;
7662 tex.src_sel_z = 2;
7663 tex.src_sel_w = 3;
7664 } else {
7665 tex.src_sel_x = ctx->src[0].swizzle[0];
7666 tex.src_sel_y = ctx->src[0].swizzle[1];
7667 tex.src_sel_z = ctx->src[0].swizzle[2];
7668 tex.src_sel_w = ctx->src[0].swizzle[3];
7669 tex.src_rel = ctx->src[0].rel;
7670 }
7671
7672 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
7673 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
7674 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7675 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
7676 tex.src_sel_x = 1;
7677 tex.src_sel_y = 0;
7678 tex.src_sel_z = 3;
7679 tex.src_sel_w = 2; /* route Z compare or Lod value into W */
7680 }
7681
7682 if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
7683 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
7684 tex.coord_type_x = 1;
7685 tex.coord_type_y = 1;
7686 }
7687 tex.coord_type_z = 1;
7688 tex.coord_type_w = 1;
7689
7690 tex.offset_x = offset_x;
7691 tex.offset_y = offset_y;
7692 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
7693 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7694 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
7695 tex.offset_z = 0;
7696 }
7697 else {
7698 tex.offset_z = offset_z;
7699 }
7700
7701 /* Put the depth for comparison in W.
7702 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
7703 * Some instructions expect the depth in Z. */
7704 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
7705 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
7706 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
7707 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
7708 opcode != FETCH_OP_SAMPLE_C_L &&
7709 opcode != FETCH_OP_SAMPLE_C_LB) {
7710 tex.src_sel_w = tex.src_sel_z;
7711 }
7712
7713 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
7714 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
7715 if (opcode == FETCH_OP_SAMPLE_C_L ||
7716 opcode == FETCH_OP_SAMPLE_C_LB) {
7717 /* the array index is read from Y */
7718 tex.coord_type_y = 0;
7719 } else {
7720 /* the array index is read from Z */
7721 tex.coord_type_z = 0;
7722 tex.src_sel_z = tex.src_sel_y;
7723 }
7724 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
7725 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
7726 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
7727 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
7728 (ctx->bc->chip_class >= EVERGREEN)))
7729 /* the array index is read from Z */
7730 tex.coord_type_z = 0;
7731
7732 /* mask unused source components */
7733 if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
7734 switch (inst->Texture.Texture) {
7735 case TGSI_TEXTURE_2D:
7736 case TGSI_TEXTURE_RECT:
7737 tex.src_sel_z = 7;
7738 tex.src_sel_w = 7;
7739 break;
7740 case TGSI_TEXTURE_1D_ARRAY:
7741 tex.src_sel_y = 7;
7742 tex.src_sel_w = 7;
7743 break;
7744 case TGSI_TEXTURE_1D:
7745 tex.src_sel_y = 7;
7746 tex.src_sel_z = 7;
7747 tex.src_sel_w = 7;
7748 break;
7749 }
7750 }
7751
7752 r = r600_bytecode_add_tex(ctx->bc, &tex);
7753 if (r)
7754 return r;
7755
7756 /* add shadow ambient support - gallium doesn't do it yet */
7757 return 0;
7758 }
7759
7760 static int find_hw_atomic_counter(struct r600_shader_ctx *ctx,
7761 struct tgsi_full_src_register *src)
7762 {
7763 unsigned i;
7764
7765 if (src->Register.Indirect) {
7766 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
7767 if (src->Indirect.ArrayID == ctx->shader->atomics[i].array_id)
7768 return ctx->shader->atomics[i].hw_idx;
7769 }
7770 } else {
7771 uint32_t index = src->Register.Index;
7772 for (i = 0; i < ctx->shader->nhwatomic_ranges; i++) {
7773 if (ctx->shader->atomics[i].buffer_id != (unsigned)src->Dimension.Index)
7774 continue;
7775 if (index > ctx->shader->atomics[i].end)
7776 continue;
7777 if (index < ctx->shader->atomics[i].start)
7778 continue;
7779 uint32_t offset = (index - ctx->shader->atomics[i].start);
7780 return ctx->shader->atomics[i].hw_idx + offset;
7781 }
7782 }
7783 assert(0);
7784 return -1;
7785 }
7786
7787 static int tgsi_set_gds_temp(struct r600_shader_ctx *ctx,
7788 int *uav_id_p, int *uav_index_mode_p)
7789 {
7790 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7791 int uav_id, uav_index_mode = 0;
7792 int r;
7793 bool is_cm = (ctx->bc->chip_class == CAYMAN);
7794
7795 uav_id = find_hw_atomic_counter(ctx, &inst->Src[0]);
7796
7797 if (inst->Src[0].Register.Indirect) {
7798 if (is_cm) {
7799 struct r600_bytecode_alu alu;
7800 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7801 alu.op = ALU_OP2_LSHL_INT;
7802 alu.src[0].sel = get_address_file_reg(ctx, inst->Src[0].Indirect.Index);
7803 alu.src[0].chan = 0;
7804 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7805 alu.src[1].value = 2;
7806 alu.dst.sel = ctx->temp_reg;
7807 alu.dst.chan = 0;
7808 alu.dst.write = 1;
7809 alu.last = 1;
7810 r = r600_bytecode_add_alu(ctx->bc, &alu);
7811 if (r)
7812 return r;
7813
7814 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
7815 ctx->temp_reg, 0,
7816 ctx->temp_reg, 0,
7817 V_SQ_ALU_SRC_LITERAL, uav_id * 4);
7818 if (r)
7819 return r;
7820 } else
7821 uav_index_mode = 2;
7822 } else if (is_cm) {
7823 r = single_alu_op2(ctx, ALU_OP1_MOV,
7824 ctx->temp_reg, 0,
7825 V_SQ_ALU_SRC_LITERAL, uav_id * 4,
7826 0, 0);
7827 if (r)
7828 return r;
7829 }
7830 *uav_id_p = uav_id;
7831 *uav_index_mode_p = uav_index_mode;
7832 return 0;
7833 }
7834
7835 static int tgsi_load_gds(struct r600_shader_ctx *ctx)
7836 {
7837 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7838 int r;
7839 struct r600_bytecode_gds gds;
7840 int uav_id = 0;
7841 int uav_index_mode = 0;
7842 bool is_cm = (ctx->bc->chip_class == CAYMAN);
7843
7844 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
7845 if (r)
7846 return r;
7847
7848 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
7849 gds.op = FETCH_OP_GDS_READ_RET;
7850 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7851 gds.uav_id = is_cm ? 0 : uav_id;
7852 gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
7853 gds.src_gpr = ctx->temp_reg;
7854 gds.src_sel_x = (is_cm) ? 0 : 4;
7855 gds.src_sel_y = 4;
7856 gds.src_sel_z = 4;
7857 gds.dst_sel_x = 0;
7858 gds.dst_sel_y = 7;
7859 gds.dst_sel_z = 7;
7860 gds.dst_sel_w = 7;
7861 gds.src_gpr2 = 0;
7862 gds.alloc_consume = !is_cm;
7863 r = r600_bytecode_add_gds(ctx->bc, &gds);
7864 if (r)
7865 return r;
7866
7867 ctx->bc->cf_last->vpm = 1;
7868 return 0;
7869 }
7870
7871 /* this fixes up 1D arrays properly */
7872 static int load_index_src(struct r600_shader_ctx *ctx, int src_index, int *idx_gpr)
7873 {
7874 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7875 int r, i;
7876 struct r600_bytecode_alu alu;
7877 int temp_reg = r600_get_temp(ctx);
7878
7879 for (i = 0; i < 4; i++) {
7880 bool def_val = true, write_zero = false;
7881 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7882 alu.op = ALU_OP1_MOV;
7883 alu.dst.sel = temp_reg;
7884 alu.dst.chan = i;
7885
7886 switch (inst->Memory.Texture) {
7887 case TGSI_TEXTURE_BUFFER:
7888 case TGSI_TEXTURE_1D:
7889 if (i == 1 || i == 2 || i == 3) {
7890 write_zero = true;
7891 }
7892 break;
7893 case TGSI_TEXTURE_1D_ARRAY:
7894 if (i == 1 || i == 3)
7895 write_zero = true;
7896 else if (i == 2) {
7897 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], 1);
7898 def_val = false;
7899 }
7900 break;
7901 case TGSI_TEXTURE_2D:
7902 if (i == 2 || i == 3)
7903 write_zero = true;
7904 break;
7905 default:
7906 if (i == 3)
7907 write_zero = true;
7908 break;
7909 }
7910
7911 if (write_zero) {
7912 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
7913 alu.src[0].value = 0;
7914 } else if (def_val) {
7915 r600_bytecode_src(&alu.src[0], &ctx->src[src_index], i);
7916 }
7917
7918 if (i == 3)
7919 alu.last = 1;
7920 alu.dst.write = 1;
7921 r = r600_bytecode_add_alu(ctx->bc, &alu);
7922 if (r)
7923 return r;
7924 }
7925 *idx_gpr = temp_reg;
7926 return 0;
7927 }
7928
7929 static int load_buffer_coord(struct r600_shader_ctx *ctx, int src_idx,
7930 int temp_reg)
7931 {
7932 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7933 int r;
7934 if (inst->Src[src_idx].Register.File == TGSI_FILE_IMMEDIATE) {
7935 int value = (ctx->literals[4 * inst->Src[src_idx].Register.Index + inst->Src[src_idx].Register.SwizzleX]);
7936 r = single_alu_op2(ctx, ALU_OP1_MOV,
7937 temp_reg, 0,
7938 V_SQ_ALU_SRC_LITERAL, value >> 2,
7939 0, 0);
7940 if (r)
7941 return r;
7942 } else {
7943 struct r600_bytecode_alu alu;
7944 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7945 alu.op = ALU_OP2_LSHR_INT;
7946 r600_bytecode_src(&alu.src[0], &ctx->src[src_idx], 0);
7947 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
7948 alu.src[1].value = 2;
7949 alu.dst.sel = temp_reg;
7950 alu.dst.write = 1;
7951 alu.last = 1;
7952 r = r600_bytecode_add_alu(ctx->bc, &alu);
7953 if (r)
7954 return r;
7955 }
7956 return 0;
7957 }
7958
7959 static int tgsi_load_buffer(struct r600_shader_ctx *ctx)
7960 {
7961 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7962 /* have to work out the offset into the RAT immediate return buffer */
7963 struct r600_bytecode_vtx vtx;
7964 struct r600_bytecode_cf *cf;
7965 int r;
7966 int temp_reg = r600_get_temp(ctx);
7967 unsigned rat_index_mode;
7968 unsigned base;
7969
7970 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
7971 base = R600_IMAGE_REAL_RESOURCE_OFFSET + ctx->info.file_count[TGSI_FILE_IMAGE];
7972
7973 r = load_buffer_coord(ctx, 1, temp_reg);
7974 if (r)
7975 return r;
7976 ctx->bc->cf_last->barrier = 1;
7977 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
7978 vtx.op = FETCH_OP_VFETCH;
7979 vtx.buffer_id = inst->Src[0].Register.Index + base;
7980 vtx.buffer_index_mode = rat_index_mode;
7981 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
7982 vtx.src_gpr = temp_reg;
7983 vtx.src_sel_x = 0;
7984 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
7985 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
7986 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
7987 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
7988 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
7989 vtx.num_format_all = 1;
7990 vtx.format_comp_all = 1;
7991 vtx.srf_mode_all = 0;
7992
7993 if (inst->Dst[0].Register.WriteMask & 8) {
7994 vtx.data_format = FMT_32_32_32_32;
7995 vtx.use_const_fields = 0;
7996 } else if (inst->Dst[0].Register.WriteMask & 4) {
7997 vtx.data_format = FMT_32_32_32;
7998 vtx.use_const_fields = 0;
7999 } else if (inst->Dst[0].Register.WriteMask & 2) {
8000 vtx.data_format = FMT_32_32;
8001 vtx.use_const_fields = 0;
8002 } else {
8003 vtx.data_format = FMT_32;
8004 vtx.use_const_fields = 0;
8005 }
8006
8007 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8008 if (r)
8009 return r;
8010 cf = ctx->bc->cf_last;
8011 cf->barrier = 1;
8012 return 0;
8013 }
8014
8015 static int tgsi_load_rat(struct r600_shader_ctx *ctx)
8016 {
8017 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8018 /* have to work out the offset into the RAT immediate return buffer */
8019 struct r600_bytecode_vtx vtx;
8020 struct r600_bytecode_cf *cf;
8021 int r;
8022 int idx_gpr;
8023 unsigned format, num_format, format_comp, endian;
8024 const struct util_format_description *desc;
8025 unsigned rat_index_mode;
8026 unsigned immed_base;
8027
8028 r = load_thread_id_gpr(ctx);
8029 if (r)
8030 return r;
8031
8032 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8033
8034 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
8035 r = load_index_src(ctx, 1, &idx_gpr);
8036 if (r)
8037 return r;
8038
8039 if (rat_index_mode)
8040 egcm_load_index_reg(ctx->bc, 1, false);
8041
8042 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8043 cf = ctx->bc->cf_last;
8044
8045 cf->rat.id = ctx->shader->rat_base + inst->Src[0].Register.Index;
8046 cf->rat.inst = V_RAT_INST_NOP_RTN;
8047 cf->rat.index_mode = rat_index_mode;
8048 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
8049 cf->output.gpr = ctx->thread_id_gpr;
8050 cf->output.index_gpr = idx_gpr;
8051 cf->output.comp_mask = 0xf;
8052 cf->output.burst_count = 1;
8053 cf->vpm = 1;
8054 cf->barrier = 1;
8055 cf->mark = 1;
8056 cf->output.elem_size = 0;
8057
8058 r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
8059 cf = ctx->bc->cf_last;
8060 cf->barrier = 1;
8061
8062 desc = util_format_description(inst->Memory.Format);
8063 r600_vertex_data_type(inst->Memory.Format,
8064 &format, &num_format, &format_comp, &endian);
8065 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8066 vtx.op = FETCH_OP_VFETCH;
8067 vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
8068 vtx.buffer_index_mode = rat_index_mode;
8069 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8070 vtx.src_gpr = ctx->thread_id_gpr;
8071 vtx.src_sel_x = 1;
8072 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8073 vtx.dst_sel_x = desc->swizzle[0];
8074 vtx.dst_sel_y = desc->swizzle[1];
8075 vtx.dst_sel_z = desc->swizzle[2];
8076 vtx.dst_sel_w = desc->swizzle[3];
8077 vtx.srf_mode_all = 1;
8078 vtx.data_format = format;
8079 vtx.num_format_all = num_format;
8080 vtx.format_comp_all = format_comp;
8081 vtx.endian = endian;
8082 vtx.offset = 0;
8083 vtx.mega_fetch_count = 3;
8084 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8085 if (r)
8086 return r;
8087 cf = ctx->bc->cf_last;
8088 cf->barrier = 1;
8089 return 0;
8090 }
8091
8092 static int tgsi_load_lds(struct r600_shader_ctx *ctx)
8093 {
8094 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8095 struct r600_bytecode_alu alu;
8096 int r;
8097 int temp_reg = r600_get_temp(ctx);
8098
8099 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8100 alu.op = ALU_OP1_MOV;
8101 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
8102 alu.dst.sel = temp_reg;
8103 alu.dst.write = 1;
8104 alu.last = 1;
8105 r = r600_bytecode_add_alu(ctx->bc, &alu);
8106 if (r)
8107 return r;
8108
8109 r = do_lds_fetch_values(ctx, temp_reg,
8110 ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index, inst->Dst[0].Register.WriteMask);
8111 if (r)
8112 return r;
8113 return 0;
8114 }
8115
8116 static int tgsi_load(struct r600_shader_ctx *ctx)
8117 {
8118 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8119 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
8120 return tgsi_load_rat(ctx);
8121 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
8122 return tgsi_load_gds(ctx);
8123 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
8124 return tgsi_load_buffer(ctx);
8125 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
8126 return tgsi_load_lds(ctx);
8127 return 0;
8128 }
8129
8130 static int tgsi_store_buffer_rat(struct r600_shader_ctx *ctx)
8131 {
8132 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8133 struct r600_bytecode_cf *cf;
8134 int r, i;
8135 unsigned rat_index_mode;
8136 int lasti;
8137 int temp_reg = r600_get_temp(ctx), treg2 = r600_get_temp(ctx);
8138
8139 r = load_buffer_coord(ctx, 0, treg2);
8140 if (r)
8141 return r;
8142
8143 rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8144 if (rat_index_mode)
8145 egcm_load_index_reg(ctx->bc, 1, false);
8146
8147 for (i = 0; i <= 3; i++) {
8148 struct r600_bytecode_alu alu;
8149 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8150 alu.op = ALU_OP1_MOV;
8151 alu.dst.sel = temp_reg;
8152 alu.dst.chan = i;
8153 alu.src[0].sel = V_SQ_ALU_SRC_0;
8154 alu.last = (i == 3);
8155 alu.dst.write = 1;
8156 r = r600_bytecode_add_alu(ctx->bc, &alu);
8157 if (r)
8158 return r;
8159 }
8160
8161 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8162 for (i = 0; i <= lasti; i++) {
8163 struct r600_bytecode_alu alu;
8164 if (!((1 << i) & inst->Dst[0].Register.WriteMask))
8165 continue;
8166
8167 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
8168 temp_reg, 0,
8169 treg2, 0,
8170 V_SQ_ALU_SRC_LITERAL, i);
8171 if (r)
8172 return r;
8173
8174 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8175 alu.op = ALU_OP1_MOV;
8176 alu.dst.sel = ctx->temp_reg;
8177 alu.dst.chan = 0;
8178
8179 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
8180 alu.last = 1;
8181 alu.dst.write = 1;
8182 r = r600_bytecode_add_alu(ctx->bc, &alu);
8183 if (r)
8184 return r;
8185
8186 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8187 cf = ctx->bc->cf_last;
8188
8189 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index + ctx->info.file_count[TGSI_FILE_IMAGE];
8190 cf->rat.inst = V_RAT_INST_STORE_TYPED;
8191 cf->rat.index_mode = rat_index_mode;
8192 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
8193 cf->output.gpr = ctx->temp_reg;
8194 cf->output.index_gpr = temp_reg;
8195 cf->output.comp_mask = 1;
8196 cf->output.burst_count = 1;
8197 cf->vpm = 1;
8198 cf->barrier = 1;
8199 cf->output.elem_size = 0;
8200 }
8201 return 0;
8202 }
8203
8204 static int tgsi_store_rat(struct r600_shader_ctx *ctx)
8205 {
8206 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8207 struct r600_bytecode_cf *cf;
8208 bool src_requires_loading = false;
8209 int val_gpr, idx_gpr;
8210 int r, i;
8211 unsigned rat_index_mode;
8212
8213 rat_index_mode = inst->Dst[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8214
8215 r = load_index_src(ctx, 0, &idx_gpr);
8216 if (r)
8217 return r;
8218
8219 if (inst->Src[1].Register.File != TGSI_FILE_TEMPORARY)
8220 src_requires_loading = true;
8221
8222 if (src_requires_loading) {
8223 struct r600_bytecode_alu alu;
8224 for (i = 0; i < 4; i++) {
8225 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8226 alu.op = ALU_OP1_MOV;
8227 alu.dst.sel = ctx->temp_reg;
8228 alu.dst.chan = i;
8229
8230 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
8231 if (i == 3)
8232 alu.last = 1;
8233 alu.dst.write = 1;
8234 r = r600_bytecode_add_alu(ctx->bc, &alu);
8235 if (r)
8236 return r;
8237 }
8238 val_gpr = ctx->temp_reg;
8239 } else
8240 val_gpr = tgsi_tex_get_src_gpr(ctx, 1);
8241 if (rat_index_mode)
8242 egcm_load_index_reg(ctx->bc, 1, false);
8243
8244 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8245 cf = ctx->bc->cf_last;
8246
8247 cf->rat.id = ctx->shader->rat_base + inst->Dst[0].Register.Index;
8248 cf->rat.inst = V_RAT_INST_STORE_TYPED;
8249 cf->rat.index_mode = rat_index_mode;
8250 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
8251 cf->output.gpr = val_gpr;
8252 cf->output.index_gpr = idx_gpr;
8253 cf->output.comp_mask = 0xf;
8254 cf->output.burst_count = 1;
8255 cf->vpm = 1;
8256 cf->barrier = 1;
8257 cf->output.elem_size = 0;
8258 return 0;
8259 }
8260
8261 static int tgsi_store_lds(struct r600_shader_ctx *ctx)
8262 {
8263 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8264 struct r600_bytecode_alu alu;
8265 int r, i, lasti;
8266 int write_mask = inst->Dst[0].Register.WriteMask;
8267 int temp_reg = r600_get_temp(ctx);
8268
8269 /* LDS write */
8270 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8271 alu.op = ALU_OP1_MOV;
8272 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8273 alu.dst.sel = temp_reg;
8274 alu.dst.write = 1;
8275 alu.last = 1;
8276 r = r600_bytecode_add_alu(ctx->bc, &alu);
8277 if (r)
8278 return r;
8279
8280 lasti = tgsi_last_instruction(write_mask);
8281 for (i = 1; i <= lasti; i++) {
8282 if (!(write_mask & (1 << i)))
8283 continue;
8284 r = single_alu_op2(ctx, ALU_OP2_ADD_INT,
8285 temp_reg, i,
8286 temp_reg, 0,
8287 V_SQ_ALU_SRC_LITERAL, 4 * i);
8288 if (r)
8289 return r;
8290 }
8291 for (i = 0; i <= lasti; i++) {
8292 if (!(write_mask & (1 << i)))
8293 continue;
8294
8295 if ((i == 0 && ((write_mask & 3) == 3)) ||
8296 (i == 2 && ((write_mask & 0xc) == 0xc))) {
8297 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8298 alu.op = LDS_OP3_LDS_WRITE_REL;
8299
8300 alu.src[0].sel = temp_reg;
8301 alu.src[0].chan = i;
8302 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
8303 r600_bytecode_src(&alu.src[2], &ctx->src[1], i + 1);
8304 alu.last = 1;
8305 alu.is_lds_idx_op = true;
8306 alu.lds_idx = 1;
8307 r = r600_bytecode_add_alu(ctx->bc, &alu);
8308 if (r)
8309 return r;
8310 i += 1;
8311 continue;
8312 }
8313 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8314 alu.op = LDS_OP2_LDS_WRITE;
8315
8316 alu.src[0].sel = temp_reg;
8317 alu.src[0].chan = i;
8318 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
8319
8320 alu.last = 1;
8321 alu.is_lds_idx_op = true;
8322
8323 r = r600_bytecode_add_alu(ctx->bc, &alu);
8324 if (r)
8325 return r;
8326 }
8327 return 0;
8328 }
8329
8330 static int tgsi_store(struct r600_shader_ctx *ctx)
8331 {
8332 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8333 if (inst->Dst[0].Register.File == TGSI_FILE_BUFFER)
8334 return tgsi_store_buffer_rat(ctx);
8335 else if (inst->Dst[0].Register.File == TGSI_FILE_MEMORY)
8336 return tgsi_store_lds(ctx);
8337 else
8338 return tgsi_store_rat(ctx);
8339 }
8340
8341 static int tgsi_atomic_op_rat(struct r600_shader_ctx *ctx)
8342 {
8343 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8344 /* have to work out the offset into the RAT immediate return buffer */
8345 struct r600_bytecode_alu alu;
8346 struct r600_bytecode_vtx vtx;
8347 struct r600_bytecode_cf *cf;
8348 int r;
8349 int idx_gpr;
8350 unsigned format, num_format, format_comp, endian;
8351 const struct util_format_description *desc;
8352 unsigned rat_index_mode;
8353 unsigned immed_base;
8354 unsigned rat_base;
8355
8356 immed_base = R600_IMAGE_IMMED_RESOURCE_OFFSET;
8357 rat_base = ctx->shader->rat_base;
8358
8359 r = load_thread_id_gpr(ctx);
8360 if (r)
8361 return r;
8362
8363 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER) {
8364 immed_base += ctx->info.file_count[TGSI_FILE_IMAGE];
8365 rat_base += ctx->info.file_count[TGSI_FILE_IMAGE];
8366
8367 r = load_buffer_coord(ctx, 1, ctx->temp_reg);
8368 if (r)
8369 return r;
8370 idx_gpr = ctx->temp_reg;
8371 } else {
8372 r = load_index_src(ctx, 1, &idx_gpr);
8373 if (r)
8374 return r;
8375 }
8376
8377 rat_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8378
8379 if (ctx->inst_info->op == V_RAT_INST_CMPXCHG_INT_RTN) {
8380 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8381 alu.op = ALU_OP1_MOV;
8382 alu.dst.sel = ctx->thread_id_gpr;
8383 alu.dst.chan = 0;
8384 alu.dst.write = 1;
8385 r600_bytecode_src(&alu.src[0], &ctx->src[3], 0);
8386 alu.last = 1;
8387 r = r600_bytecode_add_alu(ctx->bc, &alu);
8388 if (r)
8389 return r;
8390
8391 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8392 alu.op = ALU_OP1_MOV;
8393 alu.dst.sel = ctx->thread_id_gpr;
8394 if (ctx->bc->chip_class == CAYMAN)
8395 alu.dst.chan = 2;
8396 else
8397 alu.dst.chan = 3;
8398 alu.dst.write = 1;
8399 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
8400 alu.last = 1;
8401 r = r600_bytecode_add_alu(ctx->bc, &alu);
8402 if (r)
8403 return r;
8404 } else {
8405 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8406 alu.op = ALU_OP1_MOV;
8407 alu.dst.sel = ctx->thread_id_gpr;
8408 alu.dst.chan = 0;
8409 alu.dst.write = 1;
8410 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
8411 alu.last = 1;
8412 r = r600_bytecode_add_alu(ctx->bc, &alu);
8413 if (r)
8414 return r;
8415 }
8416
8417 if (rat_index_mode)
8418 egcm_load_index_reg(ctx->bc, 1, false);
8419 r600_bytecode_add_cfinst(ctx->bc, CF_OP_MEM_RAT);
8420 cf = ctx->bc->cf_last;
8421
8422 cf->rat.id = rat_base + inst->Src[0].Register.Index;
8423 cf->rat.inst = ctx->inst_info->op;
8424 cf->rat.index_mode = rat_index_mode;
8425 cf->output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_READ_IND;
8426 cf->output.gpr = ctx->thread_id_gpr;
8427 cf->output.index_gpr = idx_gpr;
8428 cf->output.comp_mask = 0xf;
8429 cf->output.burst_count = 1;
8430 cf->vpm = 1;
8431 cf->barrier = 1;
8432 cf->mark = 1;
8433 cf->output.elem_size = 0;
8434 r600_bytecode_add_cfinst(ctx->bc, CF_OP_WAIT_ACK);
8435 cf = ctx->bc->cf_last;
8436 cf->barrier = 1;
8437 cf->cf_addr = 1;
8438
8439 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
8440 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE) {
8441 desc = util_format_description(inst->Memory.Format);
8442 r600_vertex_data_type(inst->Memory.Format,
8443 &format, &num_format, &format_comp, &endian);
8444 vtx.dst_sel_x = desc->swizzle[0];
8445 } else {
8446 format = FMT_32;
8447 num_format = 1;
8448 format_comp = 0;
8449 endian = 0;
8450 vtx.dst_sel_x = 0;
8451 }
8452 vtx.op = FETCH_OP_VFETCH;
8453 vtx.buffer_id = immed_base + inst->Src[0].Register.Index;
8454 vtx.buffer_index_mode = rat_index_mode;
8455 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
8456 vtx.src_gpr = ctx->thread_id_gpr;
8457 vtx.src_sel_x = 1;
8458 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8459 vtx.dst_sel_y = 7;
8460 vtx.dst_sel_z = 7;
8461 vtx.dst_sel_w = 7;
8462 vtx.use_const_fields = 0;
8463 vtx.srf_mode_all = 1;
8464 vtx.data_format = format;
8465 vtx.num_format_all = num_format;
8466 vtx.format_comp_all = format_comp;
8467 vtx.endian = endian;
8468 vtx.offset = 0;
8469 vtx.mega_fetch_count = 0xf;
8470 r = r600_bytecode_add_vtx_tc(ctx->bc, &vtx);
8471 if (r)
8472 return r;
8473 cf = ctx->bc->cf_last;
8474 cf->vpm = 1;
8475 cf->barrier = 1;
8476 return 0;
8477 }
8478
8479 static int get_gds_op(int opcode)
8480 {
8481 switch (opcode) {
8482 case TGSI_OPCODE_ATOMUADD:
8483 return FETCH_OP_GDS_ADD_RET;
8484 case TGSI_OPCODE_ATOMAND:
8485 return FETCH_OP_GDS_AND_RET;
8486 case TGSI_OPCODE_ATOMOR:
8487 return FETCH_OP_GDS_OR_RET;
8488 case TGSI_OPCODE_ATOMXOR:
8489 return FETCH_OP_GDS_XOR_RET;
8490 case TGSI_OPCODE_ATOMUMIN:
8491 return FETCH_OP_GDS_MIN_UINT_RET;
8492 case TGSI_OPCODE_ATOMUMAX:
8493 return FETCH_OP_GDS_MAX_UINT_RET;
8494 case TGSI_OPCODE_ATOMXCHG:
8495 return FETCH_OP_GDS_XCHG_RET;
8496 case TGSI_OPCODE_ATOMCAS:
8497 return FETCH_OP_GDS_CMP_XCHG_RET;
8498 default:
8499 return -1;
8500 }
8501 }
8502
8503 static int tgsi_atomic_op_gds(struct r600_shader_ctx *ctx)
8504 {
8505 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8506 struct r600_bytecode_gds gds;
8507 struct r600_bytecode_alu alu;
8508 int gds_op = get_gds_op(inst->Instruction.Opcode);
8509 int r;
8510 int uav_id = 0;
8511 int uav_index_mode = 0;
8512 bool is_cm = (ctx->bc->chip_class == CAYMAN);
8513
8514 if (gds_op == -1) {
8515 fprintf(stderr, "unknown GDS op for opcode %d\n", inst->Instruction.Opcode);
8516 return -1;
8517 }
8518
8519 r = tgsi_set_gds_temp(ctx, &uav_id, &uav_index_mode);
8520 if (r)
8521 return r;
8522
8523 if (inst->Src[2].Register.File == TGSI_FILE_IMMEDIATE) {
8524 int value = (ctx->literals[4 * inst->Src[2].Register.Index + inst->Src[2].Register.SwizzleX]);
8525 int abs_value = abs(value);
8526 if (abs_value != value && gds_op == FETCH_OP_GDS_ADD_RET)
8527 gds_op = FETCH_OP_GDS_SUB_RET;
8528 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8529 alu.op = ALU_OP1_MOV;
8530 alu.dst.sel = ctx->temp_reg;
8531 alu.dst.chan = is_cm ? 1 : 0;
8532 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
8533 alu.src[0].value = abs_value;
8534 alu.last = 1;
8535 alu.dst.write = 1;
8536 r = r600_bytecode_add_alu(ctx->bc, &alu);
8537 if (r)
8538 return r;
8539 } else {
8540 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8541 alu.op = ALU_OP1_MOV;
8542 alu.dst.sel = ctx->temp_reg;
8543 alu.dst.chan = is_cm ? 1 : 0;
8544 r600_bytecode_src(&alu.src[0], &ctx->src[2], 0);
8545 alu.last = 1;
8546 alu.dst.write = 1;
8547 r = r600_bytecode_add_alu(ctx->bc, &alu);
8548 if (r)
8549 return r;
8550 }
8551
8552
8553 memset(&gds, 0, sizeof(struct r600_bytecode_gds));
8554 gds.op = gds_op;
8555 gds.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8556 gds.uav_id = is_cm ? 0 : uav_id;
8557 gds.uav_index_mode = is_cm ? 0 : uav_index_mode;
8558 gds.src_gpr = ctx->temp_reg;
8559 gds.src_gpr2 = 0;
8560 gds.src_sel_x = is_cm ? 0 : 4;
8561 gds.src_sel_y = is_cm ? 1 : 0;
8562 gds.src_sel_z = 7;
8563 gds.dst_sel_x = 0;
8564 gds.dst_sel_y = 7;
8565 gds.dst_sel_z = 7;
8566 gds.dst_sel_w = 7;
8567 gds.alloc_consume = !is_cm;
8568
8569 r = r600_bytecode_add_gds(ctx->bc, &gds);
8570 if (r)
8571 return r;
8572 ctx->bc->cf_last->vpm = 1;
8573 return 0;
8574 }
8575
8576 static int get_lds_op(int opcode)
8577 {
8578 switch (opcode) {
8579 case TGSI_OPCODE_ATOMUADD:
8580 return LDS_OP2_LDS_ADD_RET;
8581 case TGSI_OPCODE_ATOMAND:
8582 return LDS_OP2_LDS_AND_RET;
8583 case TGSI_OPCODE_ATOMOR:
8584 return LDS_OP2_LDS_OR_RET;
8585 case TGSI_OPCODE_ATOMXOR:
8586 return LDS_OP2_LDS_XOR_RET;
8587 case TGSI_OPCODE_ATOMUMIN:
8588 return LDS_OP2_LDS_MIN_UINT_RET;
8589 case TGSI_OPCODE_ATOMUMAX:
8590 return LDS_OP2_LDS_MAX_UINT_RET;
8591 case TGSI_OPCODE_ATOMIMIN:
8592 return LDS_OP2_LDS_MIN_INT_RET;
8593 case TGSI_OPCODE_ATOMIMAX:
8594 return LDS_OP2_LDS_MAX_INT_RET;
8595 case TGSI_OPCODE_ATOMXCHG:
8596 return LDS_OP2_LDS_XCHG_RET;
8597 case TGSI_OPCODE_ATOMCAS:
8598 return LDS_OP3_LDS_CMP_XCHG_RET;
8599 default:
8600 return -1;
8601 }
8602 }
8603
8604 static int tgsi_atomic_op_lds(struct r600_shader_ctx *ctx)
8605 {
8606 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8607 int lds_op = get_lds_op(inst->Instruction.Opcode);
8608 int r;
8609
8610 struct r600_bytecode_alu alu;
8611 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8612 alu.op = lds_op;
8613 alu.is_lds_idx_op = true;
8614 alu.last = 1;
8615 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
8616 r600_bytecode_src(&alu.src[1], &ctx->src[2], 0);
8617 if (lds_op == LDS_OP3_LDS_CMP_XCHG_RET)
8618 r600_bytecode_src(&alu.src[2], &ctx->src[3], 0);
8619 else
8620 alu.src[2].sel = V_SQ_ALU_SRC_0;
8621 r = r600_bytecode_add_alu(ctx->bc, &alu);
8622 if (r)
8623 return r;
8624
8625 /* then read from LDS_OQ_A_POP */
8626 memset(&alu, 0, sizeof(alu));
8627
8628 alu.op = ALU_OP1_MOV;
8629 alu.src[0].sel = EG_V_SQ_ALU_SRC_LDS_OQ_A_POP;
8630 alu.src[0].chan = 0;
8631 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
8632 alu.dst.write = 1;
8633 alu.last = 1;
8634 r = r600_bytecode_add_alu(ctx->bc, &alu);
8635 if (r)
8636 return r;
8637
8638 return 0;
8639 }
8640
8641 static int tgsi_atomic_op(struct r600_shader_ctx *ctx)
8642 {
8643 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8644 if (inst->Src[0].Register.File == TGSI_FILE_IMAGE)
8645 return tgsi_atomic_op_rat(ctx);
8646 if (inst->Src[0].Register.File == TGSI_FILE_HW_ATOMIC)
8647 return tgsi_atomic_op_gds(ctx);
8648 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER)
8649 return tgsi_atomic_op_rat(ctx);
8650 if (inst->Src[0].Register.File == TGSI_FILE_MEMORY)
8651 return tgsi_atomic_op_lds(ctx);
8652 return 0;
8653 }
8654
8655 static int tgsi_resq(struct r600_shader_ctx *ctx)
8656 {
8657 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8658 unsigned sampler_index_mode;
8659 struct r600_bytecode_tex tex;
8660 int r;
8661 boolean has_txq_cube_array_z = false;
8662
8663 if (inst->Src[0].Register.File == TGSI_FILE_BUFFER ||
8664 (inst->Src[0].Register.File == TGSI_FILE_IMAGE && inst->Memory.Texture == TGSI_TEXTURE_BUFFER)) {
8665 if (ctx->bc->chip_class < EVERGREEN)
8666 ctx->shader->uses_tex_buffers = true;
8667 return r600_do_buffer_txq(ctx, 0, ctx->shader->image_size_const_offset);
8668 }
8669
8670 if (inst->Memory.Texture == TGSI_TEXTURE_CUBE_ARRAY &&
8671 inst->Dst[0].Register.WriteMask & 4) {
8672 ctx->shader->has_txq_cube_array_z_comp = true;
8673 has_txq_cube_array_z = true;
8674 }
8675
8676 sampler_index_mode = inst->Src[0].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
8677 if (sampler_index_mode)
8678 egcm_load_index_reg(ctx->bc, 1, false);
8679
8680
8681 /* does this shader want a num layers from TXQ for a cube array? */
8682 if (has_txq_cube_array_z) {
8683 int id = tgsi_tex_get_src_gpr(ctx, 0) + ctx->shader->image_size_const_offset;
8684 struct r600_bytecode_alu alu;
8685
8686 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8687 alu.op = ALU_OP1_MOV;
8688
8689 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
8690 /* with eg each dword is either number of cubes */
8691 alu.src[0].sel += id / 4;
8692 alu.src[0].chan = id % 4;
8693 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
8694 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
8695 alu.last = 1;
8696 r = r600_bytecode_add_alu(ctx->bc, &alu);
8697 if (r)
8698 return r;
8699 /* disable writemask from texture instruction */
8700 inst->Dst[0].Register.WriteMask &= ~4;
8701 }
8702 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
8703 tex.op = ctx->inst_info->op;
8704 tex.sampler_id = R600_IMAGE_REAL_RESOURCE_OFFSET + inst->Src[0].Register.Index;
8705 tex.sampler_index_mode = sampler_index_mode;
8706 tex.resource_id = tex.sampler_id;
8707 tex.resource_index_mode = sampler_index_mode;
8708 tex.src_sel_x = 4;
8709 tex.src_sel_y = 4;
8710 tex.src_sel_z = 4;
8711 tex.src_sel_w = 4;
8712 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
8713 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
8714 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
8715 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
8716 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
8717 r = r600_bytecode_add_tex(ctx->bc, &tex);
8718 if (r)
8719 return r;
8720
8721 return 0;
8722 }
8723
8724 static int tgsi_lrp(struct r600_shader_ctx *ctx)
8725 {
8726 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8727 struct r600_bytecode_alu alu;
8728 unsigned lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8729 unsigned i, temp_regs[2];
8730 int r;
8731
8732 /* optimize if it's just an equal balance */
8733 if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
8734 for (i = 0; i < lasti + 1; i++) {
8735 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8736 continue;
8737
8738 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8739 alu.op = ALU_OP2_ADD;
8740 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
8741 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
8742 alu.omod = 3;
8743 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8744 alu.dst.chan = i;
8745 if (i == lasti) {
8746 alu.last = 1;
8747 }
8748 r = r600_bytecode_add_alu(ctx->bc, &alu);
8749 if (r)
8750 return r;
8751 }
8752 return 0;
8753 }
8754
8755 /* 1 - src0 */
8756 for (i = 0; i < lasti + 1; i++) {
8757 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8758 continue;
8759
8760 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8761 alu.op = ALU_OP2_ADD;
8762 alu.src[0].sel = V_SQ_ALU_SRC_1;
8763 alu.src[0].chan = 0;
8764 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
8765 r600_bytecode_src_toggle_neg(&alu.src[1]);
8766 alu.dst.sel = ctx->temp_reg;
8767 alu.dst.chan = i;
8768 if (i == lasti) {
8769 alu.last = 1;
8770 }
8771 alu.dst.write = 1;
8772 r = r600_bytecode_add_alu(ctx->bc, &alu);
8773 if (r)
8774 return r;
8775 }
8776
8777 /* (1 - src0) * src2 */
8778 for (i = 0; i < lasti + 1; i++) {
8779 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8780 continue;
8781
8782 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8783 alu.op = ALU_OP2_MUL;
8784 alu.src[0].sel = ctx->temp_reg;
8785 alu.src[0].chan = i;
8786 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
8787 alu.dst.sel = ctx->temp_reg;
8788 alu.dst.chan = i;
8789 if (i == lasti) {
8790 alu.last = 1;
8791 }
8792 alu.dst.write = 1;
8793 r = r600_bytecode_add_alu(ctx->bc, &alu);
8794 if (r)
8795 return r;
8796 }
8797
8798 /* src0 * src1 + (1 - src0) * src2 */
8799 if (ctx->src[0].abs)
8800 temp_regs[0] = r600_get_temp(ctx);
8801 else
8802 temp_regs[0] = 0;
8803 if (ctx->src[1].abs)
8804 temp_regs[1] = r600_get_temp(ctx);
8805 else
8806 temp_regs[1] = 0;
8807
8808 for (i = 0; i < lasti + 1; i++) {
8809 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8810 continue;
8811
8812 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8813 alu.op = ALU_OP3_MULADD;
8814 alu.is_op3 = 1;
8815 r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
8816 if (r)
8817 return r;
8818 r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
8819 if (r)
8820 return r;
8821 alu.src[2].sel = ctx->temp_reg;
8822 alu.src[2].chan = i;
8823
8824 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8825 alu.dst.chan = i;
8826 if (i == lasti) {
8827 alu.last = 1;
8828 }
8829 r = r600_bytecode_add_alu(ctx->bc, &alu);
8830 if (r)
8831 return r;
8832 }
8833 return 0;
8834 }
8835
8836 static int tgsi_cmp(struct r600_shader_ctx *ctx)
8837 {
8838 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8839 struct r600_bytecode_alu alu;
8840 int i, r, j;
8841 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8842 int temp_regs[3];
8843 unsigned op;
8844
8845 if (ctx->src[0].abs && ctx->src[0].neg) {
8846 op = ALU_OP3_CNDE;
8847 ctx->src[0].abs = 0;
8848 ctx->src[0].neg = 0;
8849 } else {
8850 op = ALU_OP3_CNDGE;
8851 }
8852
8853 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
8854 temp_regs[j] = 0;
8855 if (ctx->src[j].abs)
8856 temp_regs[j] = r600_get_temp(ctx);
8857 }
8858
8859 for (i = 0; i < lasti + 1; i++) {
8860 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8861 continue;
8862
8863 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8864 alu.op = op;
8865 r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
8866 if (r)
8867 return r;
8868 r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
8869 if (r)
8870 return r;
8871 r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
8872 if (r)
8873 return r;
8874 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8875 alu.dst.chan = i;
8876 alu.dst.write = 1;
8877 alu.is_op3 = 1;
8878 if (i == lasti)
8879 alu.last = 1;
8880 r = r600_bytecode_add_alu(ctx->bc, &alu);
8881 if (r)
8882 return r;
8883 }
8884 return 0;
8885 }
8886
8887 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
8888 {
8889 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8890 struct r600_bytecode_alu alu;
8891 int i, r;
8892 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
8893
8894 for (i = 0; i < lasti + 1; i++) {
8895 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
8896 continue;
8897
8898 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8899 alu.op = ALU_OP3_CNDE_INT;
8900 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
8901 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
8902 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
8903 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8904 alu.dst.chan = i;
8905 alu.dst.write = 1;
8906 alu.is_op3 = 1;
8907 if (i == lasti)
8908 alu.last = 1;
8909 r = r600_bytecode_add_alu(ctx->bc, &alu);
8910 if (r)
8911 return r;
8912 }
8913 return 0;
8914 }
8915
8916 static int tgsi_exp(struct r600_shader_ctx *ctx)
8917 {
8918 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
8919 struct r600_bytecode_alu alu;
8920 int r;
8921 unsigned i;
8922
8923 /* result.x = 2^floor(src); */
8924 if (inst->Dst[0].Register.WriteMask & 1) {
8925 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8926
8927 alu.op = ALU_OP1_FLOOR;
8928 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8929
8930 alu.dst.sel = ctx->temp_reg;
8931 alu.dst.chan = 0;
8932 alu.dst.write = 1;
8933 alu.last = 1;
8934 r = r600_bytecode_add_alu(ctx->bc, &alu);
8935 if (r)
8936 return r;
8937
8938 if (ctx->bc->chip_class == CAYMAN) {
8939 for (i = 0; i < 3; i++) {
8940 alu.op = ALU_OP1_EXP_IEEE;
8941 alu.src[0].sel = ctx->temp_reg;
8942 alu.src[0].chan = 0;
8943
8944 alu.dst.sel = ctx->temp_reg;
8945 alu.dst.chan = i;
8946 alu.dst.write = i == 0;
8947 alu.last = i == 2;
8948 r = r600_bytecode_add_alu(ctx->bc, &alu);
8949 if (r)
8950 return r;
8951 }
8952 } else {
8953 alu.op = ALU_OP1_EXP_IEEE;
8954 alu.src[0].sel = ctx->temp_reg;
8955 alu.src[0].chan = 0;
8956
8957 alu.dst.sel = ctx->temp_reg;
8958 alu.dst.chan = 0;
8959 alu.dst.write = 1;
8960 alu.last = 1;
8961 r = r600_bytecode_add_alu(ctx->bc, &alu);
8962 if (r)
8963 return r;
8964 }
8965 }
8966
8967 /* result.y = tmp - floor(tmp); */
8968 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
8969 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8970
8971 alu.op = ALU_OP1_FRACT;
8972 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8973
8974 alu.dst.sel = ctx->temp_reg;
8975 #if 0
8976 r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
8977 if (r)
8978 return r;
8979 #endif
8980 alu.dst.write = 1;
8981 alu.dst.chan = 1;
8982
8983 alu.last = 1;
8984
8985 r = r600_bytecode_add_alu(ctx->bc, &alu);
8986 if (r)
8987 return r;
8988 }
8989
8990 /* result.z = RoughApprox2ToX(tmp);*/
8991 if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
8992 if (ctx->bc->chip_class == CAYMAN) {
8993 for (i = 0; i < 3; i++) {
8994 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
8995 alu.op = ALU_OP1_EXP_IEEE;
8996 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
8997
8998 alu.dst.sel = ctx->temp_reg;
8999 alu.dst.chan = i;
9000 if (i == 2) {
9001 alu.dst.write = 1;
9002 alu.last = 1;
9003 }
9004
9005 r = r600_bytecode_add_alu(ctx->bc, &alu);
9006 if (r)
9007 return r;
9008 }
9009 } else {
9010 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9011 alu.op = ALU_OP1_EXP_IEEE;
9012 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9013
9014 alu.dst.sel = ctx->temp_reg;
9015 alu.dst.write = 1;
9016 alu.dst.chan = 2;
9017
9018 alu.last = 1;
9019
9020 r = r600_bytecode_add_alu(ctx->bc, &alu);
9021 if (r)
9022 return r;
9023 }
9024 }
9025
9026 /* result.w = 1.0;*/
9027 if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
9028 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9029
9030 alu.op = ALU_OP1_MOV;
9031 alu.src[0].sel = V_SQ_ALU_SRC_1;
9032 alu.src[0].chan = 0;
9033
9034 alu.dst.sel = ctx->temp_reg;
9035 alu.dst.chan = 3;
9036 alu.dst.write = 1;
9037 alu.last = 1;
9038 r = r600_bytecode_add_alu(ctx->bc, &alu);
9039 if (r)
9040 return r;
9041 }
9042 return tgsi_helper_copy(ctx, inst);
9043 }
9044
9045 static int tgsi_log(struct r600_shader_ctx *ctx)
9046 {
9047 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9048 struct r600_bytecode_alu alu;
9049 int r;
9050 unsigned i;
9051
9052 /* result.x = floor(log2(|src|)); */
9053 if (inst->Dst[0].Register.WriteMask & 1) {
9054 if (ctx->bc->chip_class == CAYMAN) {
9055 for (i = 0; i < 3; i++) {
9056 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9057
9058 alu.op = ALU_OP1_LOG_IEEE;
9059 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9060 r600_bytecode_src_set_abs(&alu.src[0]);
9061
9062 alu.dst.sel = ctx->temp_reg;
9063 alu.dst.chan = i;
9064 if (i == 0)
9065 alu.dst.write = 1;
9066 if (i == 2)
9067 alu.last = 1;
9068 r = r600_bytecode_add_alu(ctx->bc, &alu);
9069 if (r)
9070 return r;
9071 }
9072
9073 } else {
9074 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9075
9076 alu.op = ALU_OP1_LOG_IEEE;
9077 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9078 r600_bytecode_src_set_abs(&alu.src[0]);
9079
9080 alu.dst.sel = ctx->temp_reg;
9081 alu.dst.chan = 0;
9082 alu.dst.write = 1;
9083 alu.last = 1;
9084 r = r600_bytecode_add_alu(ctx->bc, &alu);
9085 if (r)
9086 return r;
9087 }
9088
9089 alu.op = ALU_OP1_FLOOR;
9090 alu.src[0].sel = ctx->temp_reg;
9091 alu.src[0].chan = 0;
9092
9093 alu.dst.sel = ctx->temp_reg;
9094 alu.dst.chan = 0;
9095 alu.dst.write = 1;
9096 alu.last = 1;
9097
9098 r = r600_bytecode_add_alu(ctx->bc, &alu);
9099 if (r)
9100 return r;
9101 }
9102
9103 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
9104 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
9105
9106 if (ctx->bc->chip_class == CAYMAN) {
9107 for (i = 0; i < 3; i++) {
9108 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9109
9110 alu.op = ALU_OP1_LOG_IEEE;
9111 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9112 r600_bytecode_src_set_abs(&alu.src[0]);
9113
9114 alu.dst.sel = ctx->temp_reg;
9115 alu.dst.chan = i;
9116 if (i == 1)
9117 alu.dst.write = 1;
9118 if (i == 2)
9119 alu.last = 1;
9120
9121 r = r600_bytecode_add_alu(ctx->bc, &alu);
9122 if (r)
9123 return r;
9124 }
9125 } else {
9126 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9127
9128 alu.op = ALU_OP1_LOG_IEEE;
9129 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9130 r600_bytecode_src_set_abs(&alu.src[0]);
9131
9132 alu.dst.sel = ctx->temp_reg;
9133 alu.dst.chan = 1;
9134 alu.dst.write = 1;
9135 alu.last = 1;
9136
9137 r = r600_bytecode_add_alu(ctx->bc, &alu);
9138 if (r)
9139 return r;
9140 }
9141
9142 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9143
9144 alu.op = ALU_OP1_FLOOR;
9145 alu.src[0].sel = ctx->temp_reg;
9146 alu.src[0].chan = 1;
9147
9148 alu.dst.sel = ctx->temp_reg;
9149 alu.dst.chan = 1;
9150 alu.dst.write = 1;
9151 alu.last = 1;
9152
9153 r = r600_bytecode_add_alu(ctx->bc, &alu);
9154 if (r)
9155 return r;
9156
9157 if (ctx->bc->chip_class == CAYMAN) {
9158 for (i = 0; i < 3; i++) {
9159 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9160 alu.op = ALU_OP1_EXP_IEEE;
9161 alu.src[0].sel = ctx->temp_reg;
9162 alu.src[0].chan = 1;
9163
9164 alu.dst.sel = ctx->temp_reg;
9165 alu.dst.chan = i;
9166 if (i == 1)
9167 alu.dst.write = 1;
9168 if (i == 2)
9169 alu.last = 1;
9170
9171 r = r600_bytecode_add_alu(ctx->bc, &alu);
9172 if (r)
9173 return r;
9174 }
9175 } else {
9176 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9177 alu.op = ALU_OP1_EXP_IEEE;
9178 alu.src[0].sel = ctx->temp_reg;
9179 alu.src[0].chan = 1;
9180
9181 alu.dst.sel = ctx->temp_reg;
9182 alu.dst.chan = 1;
9183 alu.dst.write = 1;
9184 alu.last = 1;
9185
9186 r = r600_bytecode_add_alu(ctx->bc, &alu);
9187 if (r)
9188 return r;
9189 }
9190
9191 if (ctx->bc->chip_class == CAYMAN) {
9192 for (i = 0; i < 3; i++) {
9193 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9194 alu.op = ALU_OP1_RECIP_IEEE;
9195 alu.src[0].sel = ctx->temp_reg;
9196 alu.src[0].chan = 1;
9197
9198 alu.dst.sel = ctx->temp_reg;
9199 alu.dst.chan = i;
9200 if (i == 1)
9201 alu.dst.write = 1;
9202 if (i == 2)
9203 alu.last = 1;
9204
9205 r = r600_bytecode_add_alu(ctx->bc, &alu);
9206 if (r)
9207 return r;
9208 }
9209 } else {
9210 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9211 alu.op = ALU_OP1_RECIP_IEEE;
9212 alu.src[0].sel = ctx->temp_reg;
9213 alu.src[0].chan = 1;
9214
9215 alu.dst.sel = ctx->temp_reg;
9216 alu.dst.chan = 1;
9217 alu.dst.write = 1;
9218 alu.last = 1;
9219
9220 r = r600_bytecode_add_alu(ctx->bc, &alu);
9221 if (r)
9222 return r;
9223 }
9224
9225 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9226
9227 alu.op = ALU_OP2_MUL;
9228
9229 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9230 r600_bytecode_src_set_abs(&alu.src[0]);
9231
9232 alu.src[1].sel = ctx->temp_reg;
9233 alu.src[1].chan = 1;
9234
9235 alu.dst.sel = ctx->temp_reg;
9236 alu.dst.chan = 1;
9237 alu.dst.write = 1;
9238 alu.last = 1;
9239
9240 r = r600_bytecode_add_alu(ctx->bc, &alu);
9241 if (r)
9242 return r;
9243 }
9244
9245 /* result.z = log2(|src|);*/
9246 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
9247 if (ctx->bc->chip_class == CAYMAN) {
9248 for (i = 0; i < 3; i++) {
9249 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9250
9251 alu.op = ALU_OP1_LOG_IEEE;
9252 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9253 r600_bytecode_src_set_abs(&alu.src[0]);
9254
9255 alu.dst.sel = ctx->temp_reg;
9256 if (i == 2)
9257 alu.dst.write = 1;
9258 alu.dst.chan = i;
9259 if (i == 2)
9260 alu.last = 1;
9261
9262 r = r600_bytecode_add_alu(ctx->bc, &alu);
9263 if (r)
9264 return r;
9265 }
9266 } else {
9267 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9268
9269 alu.op = ALU_OP1_LOG_IEEE;
9270 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9271 r600_bytecode_src_set_abs(&alu.src[0]);
9272
9273 alu.dst.sel = ctx->temp_reg;
9274 alu.dst.write = 1;
9275 alu.dst.chan = 2;
9276 alu.last = 1;
9277
9278 r = r600_bytecode_add_alu(ctx->bc, &alu);
9279 if (r)
9280 return r;
9281 }
9282 }
9283
9284 /* result.w = 1.0; */
9285 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
9286 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9287
9288 alu.op = ALU_OP1_MOV;
9289 alu.src[0].sel = V_SQ_ALU_SRC_1;
9290 alu.src[0].chan = 0;
9291
9292 alu.dst.sel = ctx->temp_reg;
9293 alu.dst.chan = 3;
9294 alu.dst.write = 1;
9295 alu.last = 1;
9296
9297 r = r600_bytecode_add_alu(ctx->bc, &alu);
9298 if (r)
9299 return r;
9300 }
9301
9302 return tgsi_helper_copy(ctx, inst);
9303 }
9304
9305 static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
9306 {
9307 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9308 struct r600_bytecode_alu alu;
9309 int r;
9310 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9311 unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
9312
9313 assert(inst->Dst[0].Register.Index < 3);
9314 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9315
9316 switch (inst->Instruction.Opcode) {
9317 case TGSI_OPCODE_ARL:
9318 alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
9319 break;
9320 case TGSI_OPCODE_ARR:
9321 alu.op = ALU_OP1_FLT_TO_INT;
9322 break;
9323 case TGSI_OPCODE_UARL:
9324 alu.op = ALU_OP1_MOV;
9325 break;
9326 default:
9327 assert(0);
9328 return -1;
9329 }
9330
9331 for (i = 0; i <= lasti; ++i) {
9332 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9333 continue;
9334 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9335 alu.last = i == lasti;
9336 alu.dst.sel = reg;
9337 alu.dst.chan = i;
9338 alu.dst.write = 1;
9339 r = r600_bytecode_add_alu(ctx->bc, &alu);
9340 if (r)
9341 return r;
9342 }
9343
9344 if (inst->Dst[0].Register.Index > 0)
9345 ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
9346 else
9347 ctx->bc->ar_loaded = 0;
9348
9349 return 0;
9350 }
9351 static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
9352 {
9353 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9354 struct r600_bytecode_alu alu;
9355 int r;
9356 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9357
9358 switch (inst->Instruction.Opcode) {
9359 case TGSI_OPCODE_ARL:
9360 memset(&alu, 0, sizeof(alu));
9361 alu.op = ALU_OP1_FLOOR;
9362 alu.dst.sel = ctx->bc->ar_reg;
9363 alu.dst.write = 1;
9364 for (i = 0; i <= lasti; ++i) {
9365 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
9366 alu.dst.chan = i;
9367 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9368 alu.last = i == lasti;
9369 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
9370 return r;
9371 }
9372 }
9373
9374 memset(&alu, 0, sizeof(alu));
9375 alu.op = ALU_OP1_FLT_TO_INT;
9376 alu.src[0].sel = ctx->bc->ar_reg;
9377 alu.dst.sel = ctx->bc->ar_reg;
9378 alu.dst.write = 1;
9379 /* FLT_TO_INT is trans-only on r600/r700 */
9380 alu.last = TRUE;
9381 for (i = 0; i <= lasti; ++i) {
9382 alu.dst.chan = i;
9383 alu.src[0].chan = i;
9384 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
9385 return r;
9386 }
9387 break;
9388 case TGSI_OPCODE_ARR:
9389 memset(&alu, 0, sizeof(alu));
9390 alu.op = ALU_OP1_FLT_TO_INT;
9391 alu.dst.sel = ctx->bc->ar_reg;
9392 alu.dst.write = 1;
9393 /* FLT_TO_INT is trans-only on r600/r700 */
9394 alu.last = TRUE;
9395 for (i = 0; i <= lasti; ++i) {
9396 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
9397 alu.dst.chan = i;
9398 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9399 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
9400 return r;
9401 }
9402 }
9403 break;
9404 case TGSI_OPCODE_UARL:
9405 memset(&alu, 0, sizeof(alu));
9406 alu.op = ALU_OP1_MOV;
9407 alu.dst.sel = ctx->bc->ar_reg;
9408 alu.dst.write = 1;
9409 for (i = 0; i <= lasti; ++i) {
9410 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
9411 alu.dst.chan = i;
9412 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9413 alu.last = i == lasti;
9414 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
9415 return r;
9416 }
9417 }
9418 break;
9419 default:
9420 assert(0);
9421 return -1;
9422 }
9423
9424 ctx->bc->ar_loaded = 0;
9425 return 0;
9426 }
9427
9428 static int tgsi_opdst(struct r600_shader_ctx *ctx)
9429 {
9430 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9431 struct r600_bytecode_alu alu;
9432 int i, r = 0;
9433
9434 for (i = 0; i < 4; i++) {
9435 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9436
9437 alu.op = ALU_OP2_MUL;
9438 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9439
9440 if (i == 0 || i == 3) {
9441 alu.src[0].sel = V_SQ_ALU_SRC_1;
9442 } else {
9443 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
9444 }
9445
9446 if (i == 0 || i == 2) {
9447 alu.src[1].sel = V_SQ_ALU_SRC_1;
9448 } else {
9449 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
9450 }
9451 if (i == 3)
9452 alu.last = 1;
9453 r = r600_bytecode_add_alu(ctx->bc, &alu);
9454 if (r)
9455 return r;
9456 }
9457 return 0;
9458 }
9459
9460 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
9461 {
9462 struct r600_bytecode_alu alu;
9463 int r;
9464
9465 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9466 alu.op = opcode;
9467 alu.execute_mask = 1;
9468 alu.update_pred = 1;
9469
9470 alu.dst.sel = ctx->temp_reg;
9471 alu.dst.write = 1;
9472 alu.dst.chan = 0;
9473
9474 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9475 alu.src[1].sel = V_SQ_ALU_SRC_0;
9476 alu.src[1].chan = 0;
9477
9478 alu.last = 1;
9479
9480 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
9481 if (r)
9482 return r;
9483 return 0;
9484 }
9485
9486 static int pops(struct r600_shader_ctx *ctx, int pops)
9487 {
9488 unsigned force_pop = ctx->bc->force_add_cf;
9489
9490 if (!force_pop) {
9491 int alu_pop = 3;
9492 if (ctx->bc->cf_last) {
9493 if (ctx->bc->cf_last->op == CF_OP_ALU)
9494 alu_pop = 0;
9495 else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
9496 alu_pop = 1;
9497 }
9498 alu_pop += pops;
9499 if (alu_pop == 1) {
9500 ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
9501 ctx->bc->force_add_cf = 1;
9502 } else if (alu_pop == 2) {
9503 ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
9504 ctx->bc->force_add_cf = 1;
9505 } else {
9506 force_pop = 1;
9507 }
9508 }
9509
9510 if (force_pop) {
9511 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
9512 ctx->bc->cf_last->pop_count = pops;
9513 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
9514 }
9515
9516 return 0;
9517 }
9518
9519 static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
9520 unsigned reason)
9521 {
9522 struct r600_stack_info *stack = &ctx->bc->stack;
9523 unsigned elements;
9524 int entries;
9525
9526 unsigned entry_size = stack->entry_size;
9527
9528 elements = (stack->loop + stack->push_wqm ) * entry_size;
9529 elements += stack->push;
9530
9531 switch (ctx->bc->chip_class) {
9532 case R600:
9533 case R700:
9534 /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
9535 * the stack must be reserved to hold the current active/continue
9536 * masks */
9537 if (reason == FC_PUSH_VPM) {
9538 elements += 2;
9539 }
9540 break;
9541
9542 case CAYMAN:
9543 /* r9xx: any stack operation on empty stack consumes 2 additional
9544 * elements */
9545 elements += 2;
9546
9547 /* fallthrough */
9548 /* FIXME: do the two elements added above cover the cases for the
9549 * r8xx+ below? */
9550
9551 case EVERGREEN:
9552 /* r8xx+: 2 extra elements are not always required, but one extra
9553 * element must be added for each of the following cases:
9554 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
9555 * stack usage.
9556 * (Currently we don't use ALU_ELSE_AFTER.)
9557 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
9558 * PUSH instruction executed.
9559 *
9560 * NOTE: it seems we also need to reserve additional element in some
9561 * other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
9562 * then STACK_SIZE should be 2 instead of 1 */
9563 if (reason == FC_PUSH_VPM) {
9564 elements += 1;
9565 }
9566 break;
9567
9568 default:
9569 assert(0);
9570 break;
9571 }
9572
9573 /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
9574 * for all chips, so we use 4 in the final formula, not the real entry_size
9575 * for the chip */
9576 entry_size = 4;
9577
9578 entries = (elements + (entry_size - 1)) / entry_size;
9579
9580 if (entries > stack->max_entries)
9581 stack->max_entries = entries;
9582 }
9583
9584 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
9585 {
9586 switch(reason) {
9587 case FC_PUSH_VPM:
9588 --ctx->bc->stack.push;
9589 assert(ctx->bc->stack.push >= 0);
9590 break;
9591 case FC_PUSH_WQM:
9592 --ctx->bc->stack.push_wqm;
9593 assert(ctx->bc->stack.push_wqm >= 0);
9594 break;
9595 case FC_LOOP:
9596 --ctx->bc->stack.loop;
9597 assert(ctx->bc->stack.loop >= 0);
9598 break;
9599 default:
9600 assert(0);
9601 break;
9602 }
9603 }
9604
9605 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
9606 {
9607 switch (reason) {
9608 case FC_PUSH_VPM:
9609 ++ctx->bc->stack.push;
9610 break;
9611 case FC_PUSH_WQM:
9612 ++ctx->bc->stack.push_wqm;
9613 case FC_LOOP:
9614 ++ctx->bc->stack.loop;
9615 break;
9616 default:
9617 assert(0);
9618 }
9619
9620 callstack_update_max_depth(ctx, reason);
9621 }
9622
9623 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
9624 {
9625 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
9626
9627 sp->mid = realloc((void *)sp->mid,
9628 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
9629 sp->mid[sp->num_mid] = ctx->bc->cf_last;
9630 sp->num_mid++;
9631 }
9632
9633 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
9634 {
9635 assert(ctx->bc->fc_sp < ARRAY_SIZE(ctx->bc->fc_stack));
9636 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
9637 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
9638 ctx->bc->fc_sp++;
9639 }
9640
9641 static void fc_poplevel(struct r600_shader_ctx *ctx)
9642 {
9643 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp - 1];
9644 free(sp->mid);
9645 sp->mid = NULL;
9646 sp->num_mid = 0;
9647 sp->start = NULL;
9648 sp->type = 0;
9649 ctx->bc->fc_sp--;
9650 }
9651
#if 0
/* Disabled, unfinished helpers for return / break-on-flag handling.
 * Nothing in this region is compiled; several bodies are still stubs
 * (see the XXX below).  The stray ')' that used to follow the
 * r600_bytecode_add_cfinst() calls has been removed so the region at
 * least parses if it is ever re-enabled. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN);
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

/* Stub: does nothing yet. */
static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

/* Stub: does nothing yet. */
static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
9699
9700 static int emit_if(struct r600_shader_ctx *ctx, int opcode)
9701 {
9702 int alu_type = CF_OP_ALU_PUSH_BEFORE;
9703
9704 /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
9705 * LOOP_STARTxxx for nested loops may put the branch stack into a state
9706 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
9707 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
9708 if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
9709 r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
9710 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
9711 alu_type = CF_OP_ALU;
9712 }
9713
9714 emit_logic_pred(ctx, opcode, alu_type);
9715
9716 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
9717
9718 fc_pushlevel(ctx, FC_IF);
9719
9720 callstack_push(ctx, FC_PUSH_VPM);
9721 return 0;
9722 }
9723
9724 static int tgsi_if(struct r600_shader_ctx *ctx)
9725 {
9726 return emit_if(ctx, ALU_OP2_PRED_SETNE);
9727 }
9728
9729 static int tgsi_uif(struct r600_shader_ctx *ctx)
9730 {
9731 return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
9732 }
9733
9734 static int tgsi_else(struct r600_shader_ctx *ctx)
9735 {
9736 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
9737 ctx->bc->cf_last->pop_count = 1;
9738
9739 fc_set_mid(ctx, ctx->bc->fc_sp - 1);
9740 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id;
9741 return 0;
9742 }
9743
9744 static int tgsi_endif(struct r600_shader_ctx *ctx)
9745 {
9746 pops(ctx, 1);
9747 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_IF) {
9748 R600_ERR("if/endif unbalanced in shader\n");
9749 return -1;
9750 }
9751
9752 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid == NULL) {
9753 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
9754 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->pop_count = 1;
9755 } else {
9756 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
9757 }
9758 fc_poplevel(ctx);
9759
9760 callstack_pop(ctx, FC_PUSH_VPM);
9761 return 0;
9762 }
9763
9764 static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
9765 {
9766 /* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
9767 * limited to 4096 iterations, like the other LOOP_* instructions. */
9768 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
9769
9770 fc_pushlevel(ctx, FC_LOOP);
9771
9772 /* check stack depth */
9773 callstack_push(ctx, FC_LOOP);
9774 return 0;
9775 }
9776
9777 static int tgsi_endloop(struct r600_shader_ctx *ctx)
9778 {
9779 int i;
9780
9781 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
9782
9783 if (ctx->bc->fc_stack[ctx->bc->fc_sp - 1].type != FC_LOOP) {
9784 R600_ERR("loop/endloop in shader code are not paired.\n");
9785 return -EINVAL;
9786 }
9787
9788 /* fixup loop pointers - from r600isa
9789 LOOP END points to CF after LOOP START,
9790 LOOP START point to CF after LOOP END
9791 BRK/CONT point to LOOP END CF
9792 */
9793 ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->id + 2;
9794
9795 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].start->cf_addr = ctx->bc->cf_last->id + 2;
9796
9797 for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp - 1].num_mid; i++) {
9798 ctx->bc->fc_stack[ctx->bc->fc_sp - 1].mid[i]->cf_addr = ctx->bc->cf_last->id;
9799 }
9800 /* XXX add LOOPRET support */
9801 fc_poplevel(ctx);
9802 callstack_pop(ctx, FC_LOOP);
9803 return 0;
9804 }
9805
9806 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
9807 {
9808 unsigned int fscp;
9809
9810 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
9811 {
9812 if (FC_LOOP == ctx->bc->fc_stack[fscp - 1].type)
9813 break;
9814 }
9815
9816 if (fscp == 0) {
9817 R600_ERR("Break not inside loop/endloop pair\n");
9818 return -EINVAL;
9819 }
9820
9821 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
9822
9823 fc_set_mid(ctx, fscp - 1);
9824
9825 return 0;
9826 }
9827
9828 static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
9829 {
9830 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9831 int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
9832 int r;
9833
9834 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
9835 emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
9836
9837 r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
9838 if (!r) {
9839 ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
9840 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
9841 return emit_inc_ring_offset(ctx, stream, TRUE);
9842 }
9843 return r;
9844 }
9845
9846 static int tgsi_umad(struct r600_shader_ctx *ctx)
9847 {
9848 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9849 struct r600_bytecode_alu alu;
9850 int i, j, r;
9851 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9852
9853 /* src0 * src1 */
9854 for (i = 0; i < lasti + 1; i++) {
9855 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9856 continue;
9857
9858 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9859
9860 alu.dst.chan = i;
9861 alu.dst.sel = ctx->temp_reg;
9862 alu.dst.write = 1;
9863
9864 alu.op = ALU_OP2_MULLO_UINT;
9865 for (j = 0; j < 2; j++) {
9866 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
9867 }
9868
9869 alu.last = 1;
9870 r = emit_mul_int_op(ctx->bc, &alu);
9871 if (r)
9872 return r;
9873 }
9874
9875
9876 for (i = 0; i < lasti + 1; i++) {
9877 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9878 continue;
9879
9880 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9881 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9882
9883 alu.op = ALU_OP2_ADD_INT;
9884
9885 alu.src[0].sel = ctx->temp_reg;
9886 alu.src[0].chan = i;
9887
9888 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
9889 if (i == lasti) {
9890 alu.last = 1;
9891 }
9892 r = r600_bytecode_add_alu(ctx->bc, &alu);
9893 if (r)
9894 return r;
9895 }
9896 return 0;
9897 }
9898
9899 static int tgsi_pk2h(struct r600_shader_ctx *ctx)
9900 {
9901 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9902 struct r600_bytecode_alu alu;
9903 int r, i;
9904 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9905
9906 /* temp.xy = f32_to_f16(src) */
9907 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9908 alu.op = ALU_OP1_FLT32_TO_FLT16;
9909 alu.dst.chan = 0;
9910 alu.dst.sel = ctx->temp_reg;
9911 alu.dst.write = 1;
9912 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9913 r = r600_bytecode_add_alu(ctx->bc, &alu);
9914 if (r)
9915 return r;
9916 alu.dst.chan = 1;
9917 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
9918 alu.last = 1;
9919 r = r600_bytecode_add_alu(ctx->bc, &alu);
9920 if (r)
9921 return r;
9922
9923 /* dst.x = temp.y * 0x10000 + temp.x */
9924 for (i = 0; i < lasti + 1; i++) {
9925 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9926 continue;
9927
9928 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9929 alu.op = ALU_OP3_MULADD_UINT24;
9930 alu.is_op3 = 1;
9931 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9932 alu.last = i == lasti;
9933 alu.src[0].sel = ctx->temp_reg;
9934 alu.src[0].chan = 1;
9935 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
9936 alu.src[1].value = 0x10000;
9937 alu.src[2].sel = ctx->temp_reg;
9938 alu.src[2].chan = 0;
9939 r = r600_bytecode_add_alu(ctx->bc, &alu);
9940 if (r)
9941 return r;
9942 }
9943
9944 return 0;
9945 }
9946
9947 static int tgsi_up2h(struct r600_shader_ctx *ctx)
9948 {
9949 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
9950 struct r600_bytecode_alu alu;
9951 int r, i;
9952 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
9953
9954 /* temp.x = src.x */
9955 /* note: no need to mask out the high bits */
9956 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9957 alu.op = ALU_OP1_MOV;
9958 alu.dst.chan = 0;
9959 alu.dst.sel = ctx->temp_reg;
9960 alu.dst.write = 1;
9961 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9962 r = r600_bytecode_add_alu(ctx->bc, &alu);
9963 if (r)
9964 return r;
9965
9966 /* temp.y = src.x >> 16 */
9967 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9968 alu.op = ALU_OP2_LSHR_INT;
9969 alu.dst.chan = 1;
9970 alu.dst.sel = ctx->temp_reg;
9971 alu.dst.write = 1;
9972 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
9973 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
9974 alu.src[1].value = 16;
9975 alu.last = 1;
9976 r = r600_bytecode_add_alu(ctx->bc, &alu);
9977 if (r)
9978 return r;
9979
9980 /* dst.wz = dst.xy = f16_to_f32(temp.xy) */
9981 for (i = 0; i < lasti + 1; i++) {
9982 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
9983 continue;
9984 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
9985 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
9986 alu.op = ALU_OP1_FLT16_TO_FLT32;
9987 alu.src[0].sel = ctx->temp_reg;
9988 alu.src[0].chan = i % 2;
9989 alu.last = i == lasti;
9990 r = r600_bytecode_add_alu(ctx->bc, &alu);
9991 if (r)
9992 return r;
9993 }
9994
9995 return 0;
9996 }
9997
9998 static int tgsi_bfe(struct r600_shader_ctx *ctx)
9999 {
10000 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10001 struct r600_bytecode_alu alu;
10002 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
10003 int r, i;
10004 int dst = -1;
10005
10006 if ((inst->Src[0].Register.File == inst->Dst[0].Register.File &&
10007 inst->Src[0].Register.Index == inst->Dst[0].Register.Index) ||
10008 (inst->Src[2].Register.File == inst->Dst[0].Register.File &&
10009 inst->Src[2].Register.Index == inst->Dst[0].Register.Index))
10010 dst = r600_get_temp(ctx);
10011
10012 r = tgsi_op3_dst(ctx, dst);
10013 if (r)
10014 return r;
10015
10016 for (i = 0; i < lasti + 1; i++) {
10017 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10018 alu.op = ALU_OP2_SETGE_INT;
10019 r600_bytecode_src(&alu.src[0], &ctx->src[2], i);
10020 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
10021 alu.src[1].value = 32;
10022 alu.dst.sel = ctx->temp_reg;
10023 alu.dst.chan = i;
10024 alu.dst.write = 1;
10025 if (i == lasti)
10026 alu.last = 1;
10027 r = r600_bytecode_add_alu(ctx->bc, &alu);
10028 if (r)
10029 return r;
10030 }
10031
10032 for (i = 0; i < lasti + 1; i++) {
10033 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10034 alu.op = ALU_OP3_CNDE_INT;
10035 alu.is_op3 = 1;
10036 alu.src[0].sel = ctx->temp_reg;
10037 alu.src[0].chan = i;
10038
10039 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
10040 if (dst != -1)
10041 alu.src[1].sel = dst;
10042 else
10043 alu.src[1].sel = alu.dst.sel;
10044 alu.src[1].chan = i;
10045 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
10046 alu.dst.write = 1;
10047 if (i == lasti)
10048 alu.last = 1;
10049 r = r600_bytecode_add_alu(ctx->bc, &alu);
10050 if (r)
10051 return r;
10052 }
10053
10054 return 0;
10055 }
10056
10057 static int tgsi_clock(struct r600_shader_ctx *ctx)
10058 {
10059 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
10060 struct r600_bytecode_alu alu;
10061 int r;
10062
10063 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10064 alu.op = ALU_OP1_MOV;
10065 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
10066 alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_LO;
10067 r = r600_bytecode_add_alu(ctx->bc, &alu);
10068 if (r)
10069 return r;
10070 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
10071 alu.op = ALU_OP1_MOV;
10072 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
10073 alu.src[0].sel = EG_V_SQ_ALU_SRC_TIME_HI;
10074 r = r600_bytecode_add_alu(ctx->bc, &alu);
10075 if (r)
10076 return r;
10077 return 0;
10078 }
10079
10080 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
10081 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_r600_arl},
10082 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
10083 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
10084
10085 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
10086
10087 [TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
10088 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
10089 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
10090 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
10091 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
10092 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10093 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10094 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
10095 /* MIN_DX10 returns non-nan result if one src is NaN, MIN returns NaN */
10096 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
10097 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
10098 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
10099 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
10100 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
10101 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
10102 [TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported},
10103 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
10104 [21] = { ALU_OP0_NOP, tgsi_unsupported},
10105 [22] = { ALU_OP0_NOP, tgsi_unsupported},
10106 [23] = { ALU_OP0_NOP, tgsi_unsupported},
10107 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
10108 [25] = { ALU_OP0_NOP, tgsi_unsupported},
10109 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
10110 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
10111 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
10112 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
10113 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
10114 [31] = { ALU_OP0_NOP, tgsi_unsupported},
10115 [32] = { ALU_OP0_NOP, tgsi_unsupported},
10116 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_unsupported},
10117 [34] = { ALU_OP0_NOP, tgsi_unsupported},
10118 [35] = { ALU_OP0_NOP, tgsi_unsupported},
10119 [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
10120 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
10121 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
10122 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
10123 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported},
10124 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
10125 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
10126 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
10127 [44] = { ALU_OP0_NOP, tgsi_unsupported},
10128 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
10129 [46] = { ALU_OP0_NOP, tgsi_unsupported},
10130 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
10131 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
10132 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
10133 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
10134 [51] = { ALU_OP0_NOP, tgsi_unsupported},
10135 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
10136 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
10137 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
10138 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported},
10139 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
10140 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
10141 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
10142 [59] = { ALU_OP0_NOP, tgsi_unsupported},
10143 [60] = { ALU_OP0_NOP, tgsi_unsupported},
10144 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_r600_arl},
10145 [62] = { ALU_OP0_NOP, tgsi_unsupported},
10146 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
10147 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
10148 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
10149 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
10150 [67] = { ALU_OP0_NOP, tgsi_unsupported},
10151 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
10152 [69] = { ALU_OP0_NOP, tgsi_unsupported},
10153 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
10154 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10155 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
10156 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
10157 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
10158 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
10159 [76] = { ALU_OP0_NOP, tgsi_unsupported},
10160 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
10161 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
10162 [TGSI_OPCODE_DDX_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
10163 [TGSI_OPCODE_DDY_FINE] = { ALU_OP0_NOP, tgsi_unsupported},
10164 [81] = { ALU_OP0_NOP, tgsi_unsupported},
10165 [82] = { ALU_OP0_NOP, tgsi_unsupported},
10166 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
10167 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
10168 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
10169 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
10170 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2_trans},
10171 [88] = { ALU_OP0_NOP, tgsi_unsupported},
10172 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
10173 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
10174 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
10175 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
10176 [93] = { ALU_OP0_NOP, tgsi_unsupported},
10177 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
10178 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
10179 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
10180 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
10181 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
10182 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
10183 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
10184 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
10185 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
10186 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
10187 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
10188 [TGSI_OPCODE_RESQ] = { ALU_OP0_NOP, tgsi_unsupported},
10189 [106] = { ALU_OP0_NOP, tgsi_unsupported},
10190 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
10191 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
10192 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
10193 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
10194 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
10195 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_NOP, tgsi_unsupported},
10196 [113] = { ALU_OP0_NOP, tgsi_unsupported},
10197 [114] = { ALU_OP0_NOP, tgsi_unsupported},
10198 [115] = { ALU_OP0_NOP, tgsi_unsupported},
10199 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
10200 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
10201 [TGSI_OPCODE_DFMA] = { ALU_OP0_NOP, tgsi_unsupported},
10202 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
10203 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
10204 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
10205 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
10206 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
10207 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
10208 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2_trans},
10209 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
10210 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
10211 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
10212 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
10213 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
10214 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
10215 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
10216 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
10217 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
10218 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
10219 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
10220 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
10221 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2_trans},
10222 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
10223 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2_swap},
10224 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
10225 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
10226 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
10227 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
10228 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
10229 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
10230 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
10231 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
10232 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
10233 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
10234 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
10235 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
10236 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
10237 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
10238 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
10239 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
10240 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_r600_arl},
10241 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
10242 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
10243 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
10244 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
10245 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
10246 [163] = { ALU_OP0_NOP, tgsi_unsupported},
10247 [164] = { ALU_OP0_NOP, tgsi_unsupported},
10248 [165] = { ALU_OP0_NOP, tgsi_unsupported},
10249 [TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported},
10250 [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
10251 [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
10252 [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
10253 [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
10254 [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
10255 [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
10256 [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
10257 [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
10258 [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
10259 [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
10260 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
10261 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
10262 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
10263 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
10264 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
10265 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_unsupported},
10266 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_unsupported},
10267 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_unsupported},
10268 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_unsupported},
10269 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_unsupported},
10270 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_unsupported},
10271 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_unsupported},
10272 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_unsupported},
10273 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_unsupported},
10274 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_unsupported},
10275 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_unsupported},
10276 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_unsupported},
10277 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_unsupported},
10278 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
10279 };
10280
10281 static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
10282 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
10283 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
10284 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
10285 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
10286 [TGSI_OPCODE_RSQ] = { ALU_OP0_NOP, tgsi_rsq},
10287 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
10288 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
10289 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
10290 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
10291 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10292 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10293 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
10294 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
10295 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
10296 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
10297 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
10298 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
10299 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
10300 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
10301 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
10302 [21] = { ALU_OP0_NOP, tgsi_unsupported},
10303 [22] = { ALU_OP0_NOP, tgsi_unsupported},
10304 [23] = { ALU_OP0_NOP, tgsi_unsupported},
10305 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
10306 [25] = { ALU_OP0_NOP, tgsi_unsupported},
10307 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
10308 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
10309 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
10310 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
10311 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
10312 [31] = { ALU_OP0_NOP, tgsi_unsupported},
10313 [32] = { ALU_OP0_NOP, tgsi_unsupported},
10314 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
10315 [34] = { ALU_OP0_NOP, tgsi_unsupported},
10316 [35] = { ALU_OP0_NOP, tgsi_unsupported},
10317 [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
10318 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
10319 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
10320 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
10321 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
10322 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
10323 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
10324 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
10325 [44] = { ALU_OP0_NOP, tgsi_unsupported},
10326 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
10327 [46] = { ALU_OP0_NOP, tgsi_unsupported},
10328 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
10329 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
10330 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
10331 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
10332 [51] = { ALU_OP0_NOP, tgsi_unsupported},
10333 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
10334 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
10335 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
10336 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
10337 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
10338 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
10339 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
10340 [59] = { ALU_OP0_NOP, tgsi_unsupported},
10341 [60] = { ALU_OP0_NOP, tgsi_unsupported},
10342 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
10343 [62] = { ALU_OP0_NOP, tgsi_unsupported},
10344 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
10345 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
10346 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
10347 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
10348 [67] = { ALU_OP0_NOP, tgsi_unsupported},
10349 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
10350 [69] = { ALU_OP0_NOP, tgsi_unsupported},
10351 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
10352 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10353 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
10354 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
10355 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
10356 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
10357 [76] = { ALU_OP0_NOP, tgsi_unsupported},
10358 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
10359 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
10360 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
10361 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
10362 [82] = { ALU_OP0_NOP, tgsi_unsupported},
10363 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
10364 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
10365 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
10366 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
10367 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
10368 [88] = { ALU_OP0_NOP, tgsi_unsupported},
10369 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
10370 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
10371 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
10372 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
10373 [93] = { ALU_OP0_NOP, tgsi_unsupported},
10374 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
10375 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
10376 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
10377 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
10378 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
10379 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
10380 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
10381 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
10382 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
10383 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
10384 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
10385 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
10386 [106] = { ALU_OP0_NOP, tgsi_unsupported},
10387 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
10388 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
10389 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
10390 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
10391 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
10392 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
10393 [113] = { ALU_OP0_NOP, tgsi_unsupported},
10394 [114] = { ALU_OP0_NOP, tgsi_unsupported},
10395 [115] = { ALU_OP0_NOP, tgsi_unsupported},
10396 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
10397 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
10398 /* Refer below for TGSI_OPCODE_DFMA */
10399 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_f2i},
10400 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
10401 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
10402 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
10403 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
10404 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
10405 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
10406 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
10407 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
10408 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
10409 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
10410 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
10411 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
10412 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
10413 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
10414 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
10415 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
10416 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
10417 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
10418 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
10419 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
10420 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
10421 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
10422 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
10423 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
10424 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
10425 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
10426 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
10427 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
10428 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
10429 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
10430 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
10431 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
10432 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
10433 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
10434 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
10435 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
10436 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
10437 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
10438 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
10439 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
10440 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
10441 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
10442 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
10443 [163] = { ALU_OP0_NOP, tgsi_unsupported},
10444 [164] = { ALU_OP0_NOP, tgsi_unsupported},
10445 [165] = { ALU_OP0_NOP, tgsi_unsupported},
10446 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
10447 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
10448 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
10449 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
10450 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
10451 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
10452 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
10453 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
10454 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
10455 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
10456 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
10457 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
10458 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
10459 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
10460 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
10461 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
10462 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
10463 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
10464 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
10465 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
10466 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
10467 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
10468 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
10469 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
10470 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
10471 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
10472 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
10473 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
10474 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
10475 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
10476 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
10477 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
10478 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
10479 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
10480 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
10481 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
10482 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
10483 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
10484 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
10485 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
10486 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
10487 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
10488 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
10489 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
10490 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
10491 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
10492 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
10493 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
10494 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
10495 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
10496 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
10497 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
10498 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
10499 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
10500 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
10501 };
10502
/*
 * TGSI opcode dispatch table with the "cm" prefix -- presumably the Cayman
 * variant (several handlers are named cayman_*, and the "CAYMAN notes" at the
 * top of this file describe the chip; TODO confirm against the code that
 * selects the table).
 *
 * The array is indexed by TGSI opcode through designated initializers. Each
 * entry pairs an opcode constant (ALU_OP*, FETCH_OP*, CF_OP*, V_RAT_INST_*,
 * or 0 / ALU_OP0_NOP, which look like placeholders for handlers that pick
 * their own opcodes) with the handler used for that TGSI opcode.
 *
 * Slots written with a bare number instead of a TGSI_OPCODE_* name are routed
 * to tgsi_unsupported, with the single exception of [103], which goes to
 * tgsi_tex.
 */
10503 static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
10504 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
10505 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
10506 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
10507 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
10508 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
10509 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
10510 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
10511 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL_IEEE, tgsi_op2},
10512 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
10513 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10514 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10515 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
10516 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN_DX10, tgsi_op2},
10517 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX_DX10, tgsi_op2},
10518 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap}, /* "less than" via SETGT; tgsi_op2_swap presumably swaps the operands -- confirm */
10519 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
10520 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD_IEEE, tgsi_op3},
10521 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
10522 [TGSI_OPCODE_FMA] = { ALU_OP3_FMA, tgsi_op3},
10523 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
10524 [21] = { ALU_OP0_NOP, tgsi_unsupported},
10525 [22] = { ALU_OP0_NOP, tgsi_unsupported},
10526 [23] = { ALU_OP0_NOP, tgsi_unsupported},
10527 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
10528 [25] = { ALU_OP0_NOP, tgsi_unsupported},
10529 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
10530 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
10531 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
10532 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
10533 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow},
10534 [31] = { ALU_OP0_NOP, tgsi_unsupported},
10535 [32] = { ALU_OP0_NOP, tgsi_unsupported},
10536 [TGSI_OPCODE_CLOCK] = { ALU_OP0_NOP, tgsi_clock},
10537 [34] = { ALU_OP0_NOP, tgsi_unsupported},
10538 [35] = { ALU_OP0_NOP, tgsi_unsupported},
10539 [TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},
10540 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
10541 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
10542 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
10543 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_pk2h},
10544 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
10545 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
10546 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
10547 [44] = { ALU_OP0_NOP, tgsi_unsupported},
10548 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
10549 [46] = { ALU_OP0_NOP, tgsi_unsupported},
10550 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
10551 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig},
10552 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
10553 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
10554 [51] = { ALU_OP0_NOP, tgsi_unsupported},
10555 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
10556 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
10557 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
10558 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_up2h},
10559 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
10560 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
10561 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
10562 [59] = { ALU_OP0_NOP, tgsi_unsupported},
10563 [60] = { ALU_OP0_NOP, tgsi_unsupported},
10564 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
10565 [62] = { ALU_OP0_NOP, tgsi_unsupported},
10566 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
10567 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
10568 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
10569 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
10570 [67] = { ALU_OP0_NOP, tgsi_unsupported},
10571 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
10572 [69] = { ALU_OP0_NOP, tgsi_unsupported},
10573 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
10574 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4_IEEE, tgsi_dp},
10575 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
10576 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
10577 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
10578 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
10579 [76] = { ALU_OP0_NOP, tgsi_unsupported},
10580 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
10581 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
10582 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
10583 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
10584 [82] = { ALU_OP0_NOP, tgsi_unsupported},
10585 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
10586 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2}, /* INT_TO_FLT is listed in the CAYMAN notes (top of file) as available in all vector slots */
10587 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
10588 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
10589 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
10590 [88] = { ALU_OP0_NOP, tgsi_unsupported},
10591 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
10592 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
10593 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
10594 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
10595 [93] = { ALU_OP0_NOP, tgsi_unsupported},
10596 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
10597 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
10598 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
10599 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
10600 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
10601 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
10602 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
10603 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
10604 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
10605 [103] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex}, /* only bare-number slot with a real handler; same entry as TGSI_OPCODE_TXQ */
10606 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
10607 [TGSI_OPCODE_RESQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_resq},
10608 [106] = { ALU_OP0_NOP, tgsi_unsupported},
10609 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
10610 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
10611 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
10612 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
10613 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
10614 [TGSI_OPCODE_MEMBAR] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
10615 [113] = { ALU_OP0_NOP, tgsi_unsupported},
10616 [114] = { ALU_OP0_NOP, tgsi_unsupported},
10617 [115] = { ALU_OP0_NOP, tgsi_unsupported},
10618 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
10619 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
10620 /* TGSI_OPCODE_DFMA is initialized further down, next to TGSI_OPCODE_DMAD */
10621 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2}, /* plain tgsi_op2 here; the table above uses tgsi_f2i */
10622 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
10623 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
10624 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
10625 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
10626 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
10627 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
10628 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
10629 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2}, /* FLT_TO_UINT: all-vector-slot op per the CAYMAN notes; the table above uses tgsi_f2i */
10630 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2}, /* UINT_TO_FLT: all-vector-slot op per the CAYMAN notes; the table above uses tgsi_op2_trans */
10631 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
10632 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
10633 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
10634 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
10635 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
10636 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
10637 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr}, /* NOTE(review): signed MULLO for an unsigned multiply (the table above uses MULLO_UINT); presumably fine since only the low 32 bits are kept -- confirm */
10638 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
10639 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
10640 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
10641 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
10642 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
10643 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
10644 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
10645 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
10646 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
10647 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
10648 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
10649 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
10650 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
10651 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
10652 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
10653 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
10654 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
10655 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
10656 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
10657 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
10658 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
10659 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
10660 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
10661 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
10662 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
10663 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_load},
10664 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_store},
10665 [163] = { ALU_OP0_NOP, tgsi_unsupported},
10666 [164] = { ALU_OP0_NOP, tgsi_unsupported},
10667 [165] = { ALU_OP0_NOP, tgsi_unsupported},
10668 [TGSI_OPCODE_BARRIER] = { ALU_OP0_GROUP_BARRIER, tgsi_barrier},
10669 [TGSI_OPCODE_ATOMUADD] = { V_RAT_INST_ADD_RTN, tgsi_atomic_op},
10670 [TGSI_OPCODE_ATOMXCHG] = { V_RAT_INST_XCHG_RTN, tgsi_atomic_op},
10671 [TGSI_OPCODE_ATOMCAS] = { V_RAT_INST_CMPXCHG_INT_RTN, tgsi_atomic_op},
10672 [TGSI_OPCODE_ATOMAND] = { V_RAT_INST_AND_RTN, tgsi_atomic_op},
10673 [TGSI_OPCODE_ATOMOR] = { V_RAT_INST_OR_RTN, tgsi_atomic_op},
10674 [TGSI_OPCODE_ATOMXOR] = { V_RAT_INST_XOR_RTN, tgsi_atomic_op},
10675 [TGSI_OPCODE_ATOMUMIN] = { V_RAT_INST_MIN_UINT_RTN, tgsi_atomic_op},
10676 [TGSI_OPCODE_ATOMUMAX] = { V_RAT_INST_MAX_UINT_RTN, tgsi_atomic_op},
10677 [TGSI_OPCODE_ATOMIMIN] = { V_RAT_INST_MIN_INT_RTN, tgsi_atomic_op},
10678 [TGSI_OPCODE_ATOMIMAX] = { V_RAT_INST_MAX_INT_RTN, tgsi_atomic_op},
10679 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
10680 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
10681 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
10682 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr}, /* shares cayman_mul_int_instr with UMUL and UMUL_HI; the table above uses tgsi_op2_trans */
10683 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
10684 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
10685 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
10686 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_bfe},
10687 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_bfe},
10688 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
10689 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
10690 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
10691 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
10692 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
10693 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
10694 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
10695 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
10696 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
10697 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
10698 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
10699 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
10700 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
10701 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
10702 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
10703 [TGSI_OPCODE_DDIV] = { 0, cayman_ddiv_instr },
10704 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
10705 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
10706 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
10707 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
10708 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
10709 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
10710 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
10711 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
10712 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64}, /* DMAD and DFMA both map to FMA_64 */
10713 [TGSI_OPCODE_DFMA] = { ALU_OP3_FMA_64, tgsi_op3_64},
10714 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
10715 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
10716 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
10717 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
10718 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
10719 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
10720 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
10721 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
10722 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported}, /* presumably the highest index, so it fixes the array length -- confirm against the TGSI opcode enum */
10723 };