r600g: Disable LLVM by default at runtime for graphics
[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include "r600_sq.h"
24 #include "r600_llvm.h"
25 #include "r600_formats.h"
26 #include "r600_opcodes.h"
27 #include "r600_shader.h"
28 #include "r600d.h"
29
30 #include "sb/sb_public.h"
31
32 #include "pipe/p_shader_tokens.h"
33 #include "tgsi/tgsi_info.h"
34 #include "tgsi/tgsi_parse.h"
35 #include "tgsi/tgsi_scan.h"
36 #include "tgsi/tgsi_dump.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include <stdio.h>
40 #include <errno.h>
41
42 /* CAYMAN notes
43 Why CAYMAN got loops for lots of instructions is explained here.
44
45 -These 8xx t-slot only ops are implemented in all vector slots.
46 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47 These 8xx t-slot only opcodes become vector ops, with all four
48 slots expecting the arguments on sources a and b. Result is
49 broadcast to all channels.
50 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
51 These 8xx t-slot only opcodes become vector ops in the z, y, and
52 x slots.
53 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55 SQRT_IEEE/_64
56 SIN/COS
57 The w slot may have an independent co-issued operation, or if the
58 result is required to be in the w slot, the opcode above may be
59 issued in the w slot as well.
60 The compiler must issue the source argument to slots z, y, and x
61 */
62
63 static int r600_shader_from_tgsi(struct r600_context *rctx,
64 struct r600_pipe_shader *pipeshader,
65 struct r600_shader_key key);
66
67 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
68 int size, unsigned comp_mask) {
69
70 if (!size)
71 return;
72
73 if (ps->num_arrays == ps->max_arrays) {
74 ps->max_arrays += 64;
75 ps->arrays = realloc(ps->arrays, ps->max_arrays *
76 sizeof(struct r600_shader_array));
77 }
78
79 int n = ps->num_arrays;
80 ++ps->num_arrays;
81
82 ps->arrays[n].comp_mask = comp_mask;
83 ps->arrays[n].gpr_start = start_gpr;
84 ps->arrays[n].gpr_count = size;
85 }
86
87 static void r600_dump_streamout(struct pipe_stream_output_info *so)
88 {
89 unsigned i;
90
91 fprintf(stderr, "STREAMOUT\n");
92 for (i = 0; i < so->num_outputs; i++) {
93 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
94 so->output[i].start_component;
95 fprintf(stderr, " %i: MEM_STREAM0_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
96 i, so->output[i].output_buffer,
97 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
98 so->output[i].register_index,
99 mask & 1 ? "x" : "",
100 mask & 2 ? "y" : "",
101 mask & 4 ? "z" : "",
102 mask & 8 ? "w" : "",
103 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
104 }
105 }
106
107 static int store_shader(struct pipe_context *ctx,
108 struct r600_pipe_shader *shader)
109 {
110 struct r600_context *rctx = (struct r600_context *)ctx;
111 uint32_t *ptr, i;
112
113 if (shader->bo == NULL) {
114 shader->bo = (struct r600_resource*)
115 pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
116 if (shader->bo == NULL) {
117 return -ENOMEM;
118 }
119 ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
120 if (R600_BIG_ENDIAN) {
121 for (i = 0; i < shader->shader.bc.ndw; ++i) {
122 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
123 }
124 } else {
125 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
126 }
127 rctx->b.ws->buffer_unmap(shader->bo->cs_buf);
128 }
129
130 return 0;
131 }
132
/* Compile a TGSI shader into r600 bytecode and bind the resulting state.
 *
 * Pipeline: optional TGSI dump -> r600_shader_from_tgsi() translation ->
 * bytecode build (unless the LLVM backend already built it) -> optional
 * SB post-processing/disassembly -> upload to a GPU buffer -> per-stage
 * state update.
 *
 * Returns 0 on success or a negative errno; on failure the partially
 * built shader is destroyed before returning.
 */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    struct r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
	/* SB (the optimizing backend) is on unless disabled via debug flags */
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	unsigned export_shader = key.vs_as_es;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}

	/* disable SB for geom shaders - it can't handle the CF_EMIT instructions */
	use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY);

	/* Check if the bytecode has already been built. When using the llvm
	 * backend, r600_shader_from_tgsi() will take care of building the
	 * bytecode.
	 */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	if (dump && !sb_disasm) {
		/* plain disassembly without the SB backend */
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		/* run SB for optimization and/or its disassembler */
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
					     dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case TGSI_PROCESSOR_GEOMETRY:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case TGSI_PROCESSOR_VERTEX:
		/* a VS feeding a GS is set up as an "export shader" (ES) */
		if (rctx->b.chip_class >= EVERGREEN) {
			if (export_shader)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case TGSI_PROCESSOR_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
247
/* Release everything r600_pipe_shader_create() allocated: the GPU buffer
 * holding the bytecode, the bytecode itself, and the command buffer.
 * Safe to call on a partially constructed shader (used on the error path).
 */
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
{
	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
254
255 /*
256 * tgsi -> r600 shader
257 */
258 struct r600_shader_tgsi_instruction;
259
/* Decoded form of a TGSI source operand, ready to be copied into
 * r600_bytecode_alu sources. */
struct r600_shader_src {
	unsigned				sel;       /* register selector (or V_SQ_ALU_SRC_* special) */
	unsigned				swizzle[4]; /* per-channel swizzle */
	unsigned				neg;       /* negate modifier */
	unsigned				abs;       /* absolute-value modifier */
	unsigned				rel;       /* relative (indirect) addressing flag */
	unsigned				kc_bank;   /* constant-buffer bank for CONSTANT file */
	uint32_t				value[4];  /* literal values when sel == V_SQ_ALU_SRC_LITERAL */
};
269
/* Per-translation state for the TGSI -> r600 bytecode conversion.
 * One instance lives for the duration of r600_shader_from_tgsi(). */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;       /* TGSI_PROCESSOR_* of the shader being built */
	unsigned				file_offset[TGSI_FILE_COUNT]; /* GPR base per TGSI register file */
	unsigned				temp_reg;   /* first driver-reserved temp GPR (see r600_get_temp) */
	struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];     /* decoded sources of the current instruction */
	uint32_t				*literals;  /* immediate pool gathered from TGSI */
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used;
	boolean use_llvm;                                   /* LLVM backend builds the bytecode instead */
	/* needed for evergreen interpolation */
	boolean                                 input_centroid;
	boolean                                 input_linear;
	boolean                                 input_perspective;
	int					num_interp_gpr;
	int					face_gpr;   /* GPR holding the front-face value, set in tgsi_declaration */
	int					colors_used;
	boolean                 clip_vertex_write;
	unsigned                cv_output;                  /* output slot of TGSI_SEMANTIC_CLIPVERTEX */
	unsigned		edgeflag_output;            /* output slot of TGSI_SEMANTIC_EDGEFLAG */
	int					fragcoord_input; /* input slot of TGSI_SEMANTIC_POSITION (FS) */
	int					native_integers;
	int					next_ring_offset; /* running ESGS ring offset for GS inputs */
	int					gs_out_ring_offset;
	int					gs_next_vertex;
	struct r600_shader	*gs_for_vs;
	int					gs_export_gpr_treg;
};
303
/* One row of the TGSI opcode dispatch tables (r600/eg/cm variants below):
 * maps a TGSI opcode to an r600 ALU op and its emit callback. */
struct r600_shader_tgsi_instruction {
	unsigned	tgsi_opcode;   /* TGSI_OPCODE_* this row handles */
	unsigned	is_op3;        /* true for three-source ALU ops */
	unsigned	op;            /* r600 ALU_OP* encoding */
	int (*process)(struct r600_shader_ctx *ctx); /* emit callback */
};
310
311 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind);
312 static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
313 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
314 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
315 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
316 static int tgsi_else(struct r600_shader_ctx *ctx);
317 static int tgsi_endif(struct r600_shader_ctx *ctx);
318 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
319 static int tgsi_endloop(struct r600_shader_ctx *ctx);
320 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
321
/* Reject TGSI instructions this backend cannot translate: multiple
 * destinations, predication, and two-dimensional operands other than
 * CONSTANT (always) and INPUT (geometry shaders only).
 * Returns 0 if supported, -EINVAL otherwise. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	int j;

	if (i->Instruction.NumDstRegs > 1) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
	if (i->Instruction.Predicate) {
		R600_ERR("predicate unsupported\n");
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
			switch (i->Src[j].Register.File) {
			case TGSI_FILE_CONSTANT:
				break;
			case TGSI_FILE_INPUT:
				if (ctx->type == TGSI_PROCESSOR_GEOMETRY)
					break;
				/* non-GS dimensioned inputs fall through to the error */
			default:
				R600_ERR("unsupported src %d (dimension %d)\n", j,
					 i->Src[j].Register.Dimension);
				return -EINVAL;
			}
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}
364
365 static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
366 int input)
367 {
368 int ij_index = 0;
369
370 if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
371 if (ctx->shader->input[input].centroid)
372 ij_index++;
373 } else if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_LINEAR) {
374 /* if we have perspective add one */
375 if (ctx->input_perspective) {
376 ij_index++;
377 /* if we have perspective centroid */
378 if (ctx->input_centroid)
379 ij_index++;
380 }
381 if (ctx->shader->input[input].centroid)
382 ij_index++;
383 }
384
385 ctx->shader->input[input].ij_index = ij_index;
386 }
387
/* Emit the eight INTERP ALU instructions that interpolate one fragment
 * input from its barycentric (i,j) pair: four INTERP_ZW ops followed by
 * four INTERP_XY ops.  Only the middle four (i = 2..5) write the input's
 * GPR — i.e. ZW from the first group, XY from the second.
 * Returns 0 on success or the r600_bytecode_add_alu() error. */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	/* two ij pairs are packed per GPR; even pairs sit in chans (1,0),
	 * odd pairs in chans (3,2) — base_chan is the upper channel */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		/* only the ZW result of group one and the XY result of
		 * group two land in the destination GPR */
		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* alternate between the j (base_chan) and i (base_chan-1) inputs */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
428
429 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
430 {
431 int i, r;
432 struct r600_bytecode_alu alu;
433
434 for (i = 0; i < 4; i++) {
435 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
436
437 alu.op = ALU_OP1_INTERP_LOAD_P0;
438
439 alu.dst.sel = ctx->shader->input[input].gpr;
440 alu.dst.write = 1;
441
442 alu.dst.chan = i;
443
444 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
445 alu.src[0].chan = i;
446
447 if (i == 3)
448 alu.last = 1;
449 r = r600_bytecode_add_alu(ctx->bc, &alu);
450 if (r)
451 return r;
452 }
453 return 0;
454 }
455
456 /*
457 * Special export handling in shaders
458 *
459 * shader export ARRAY_BASE for EXPORT_POS:
460 * 60 is position
461 * 61 is misc vector
462 * 62, 63 are clip distance vectors
463 *
464 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
465 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
466 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
467 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
468 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
469 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
470 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
471 * exclusive from render target index)
472 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
473 *
474 *
475 * shader export ARRAY_BASE for EXPORT_PIXEL:
476 * 0-7 CB targets
477 * 61 computed Z vector
478 *
479 * The use of the values exported in the computed Z vector are controlled
480 * by DB_SHADER_CONTROL:
481 * Z_EXPORT_ENABLE - Z as a float in RED
482 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
483 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
484 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
485 * DB_SOURCE_FORMAT - export control restrictions
486 *
487 */
488
489
490 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
491 static int r600_spi_sid(struct r600_shader_io * io)
492 {
493 int index, name = io->name;
494
495 /* These params are handled differently, they don't need
496 * semantic indices, so we'll use 0 for them.
497 */
498 if (name == TGSI_SEMANTIC_POSITION ||
499 name == TGSI_SEMANTIC_PSIZE ||
500 name == TGSI_SEMANTIC_EDGEFLAG ||
501 name == TGSI_SEMANTIC_LAYER ||
502 name == TGSI_SEMANTIC_VIEWPORT_INDEX ||
503 name == TGSI_SEMANTIC_FACE)
504 index = 0;
505 else {
506 if (name == TGSI_SEMANTIC_GENERIC) {
507 /* For generic params simply use sid from tgsi */
508 index = io->sid;
509 } else {
510 /* For non-generic params - pack name and sid into 8 bits */
511 index = 0x80 | (name<<3) | (io->sid);
512 }
513
514 /* Make sure that all really used indices have nonzero value, so
515 * we can just compare it to 0 later instead of comparing the name
516 * with different values to detect special cases. */
517 index++;
518 }
519
520 return index;
521 };
522
523 /* turn input into interpolate on EG */
524 static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
525 {
526 int r = 0;
527
528 if (ctx->shader->input[index].spi_sid) {
529 ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
530 if (ctx->shader->input[index].interpolate > 0) {
531 evergreen_interp_assign_ij_index(ctx, index);
532 if (!ctx->use_llvm)
533 r = evergreen_interp_alu(ctx, index);
534 } else {
535 if (!ctx->use_llvm)
536 r = evergreen_interp_flat(ctx, index);
537 }
538 }
539 return r;
540 }
541
542 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
543 {
544 struct r600_bytecode_alu alu;
545 int i, r;
546 int gpr_front = ctx->shader->input[front].gpr;
547 int gpr_back = ctx->shader->input[back].gpr;
548
549 for (i = 0; i < 4; i++) {
550 memset(&alu, 0, sizeof(alu));
551 alu.op = ALU_OP3_CNDGT;
552 alu.is_op3 = 1;
553 alu.dst.write = 1;
554 alu.dst.sel = gpr_front;
555 alu.src[0].sel = ctx->face_gpr;
556 alu.src[1].sel = gpr_front;
557 alu.src[2].sel = gpr_back;
558
559 alu.dst.chan = i;
560 alu.src[1].chan = i;
561 alu.src[2].chan = i;
562 alu.last = (i==3);
563
564 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
565 return r;
566 }
567
568 return 0;
569 }
570
/* Process one TGSI declaration token: record inputs/outputs in the
 * r600_shader tables, map declared ranges to GPRs, track per-stage
 * metadata (misc-vector writes, clip distances, ring offsets, ...), and
 * register indirectly-addressed temporary arrays.
 * Returns 0 on success, -EINVAL for unsupported files, or an emit error. */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		i = ctx->shader->ninput;
		assert(i < Elements(ctx->shader->input));
		ctx->shader->ninput += count;
		ctx->shader->input[i].name = d->Semantic.Name;
		ctx->shader->input[i].sid = d->Semantic.Index;
		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
		ctx->shader->input[i].centroid = d->Interp.Centroid;
		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
			/* remember where specially-routed FS inputs live */
			switch (ctx->shader->input[i].name) {
			case TGSI_SEMANTIC_FACE:
				ctx->face_gpr = ctx->shader->input[i].gpr;
				break;
			case TGSI_SEMANTIC_COLOR:
				ctx->colors_used++;
				break;
			case TGSI_SEMANTIC_POSITION:
				ctx->fragcoord_input = i;
				break;
			}
			if (ctx->bc->chip_class >= EVERGREEN) {
				if ((r = evergreen_interp_input(ctx, i)))
					return r;
			}
		} else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
			/* FIXME probably skip inputs if they aren't passed in the ring */
			ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
			ctx->next_ring_offset += 16;
			if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
				ctx->shader->gs_prim_id_input = true;
		}
		/* expand a ranged declaration: copy the first entry, bump GPRs */
		for (j = 1; j < count; ++j) {
			ctx->shader->input[i + j] = ctx->shader->input[i];
			ctx->shader->input[i + j].gpr += j;
		}
		break;
	case TGSI_FILE_OUTPUT:
		i = ctx->shader->noutput++;
		assert(i < Elements(ctx->shader->output));
		ctx->shader->output[i].name = d->Semantic.Name;
		ctx->shader->output[i].sid = d->Semantic.Index;
		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
		if (ctx->type == TGSI_PROCESSOR_VERTEX ||
		    ctx->type == TGSI_PROCESSOR_GEOMETRY) {
			ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
			/* flag which components of the misc/clip exports are written */
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_CLIPDIST:
				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
				break;
			case TGSI_SEMANTIC_PSIZE:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_point_size = 1;
				break;
			case TGSI_SEMANTIC_EDGEFLAG:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_edgeflag = 1;
				ctx->edgeflag_output = i;
				break;
			case TGSI_SEMANTIC_VIEWPORT_INDEX:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_viewport = 1;
				break;
			case TGSI_SEMANTIC_LAYER:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_layer = 1;
				break;
			case TGSI_SEMANTIC_CLIPVERTEX:
				ctx->clip_vertex_write = TRUE;
				ctx->cv_output = i;
				break;
			}
			if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				ctx->gs_out_ring_offset += 16;
			}
		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_COLOR:
				ctx->shader->nr_ps_max_color_exports++;
				break;
			}
		}
		break;
	case TGSI_FILE_TEMPORARY:
		/* indirectly-addressed temp arrays must keep their GPRs contiguous */
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				r600_add_gpr_array(ctx->shader,
				                   ctx->file_offset[TGSI_FILE_TEMPORARY] +
						   d->Range.First,
						   d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_ADDRESS:
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			if (!ctx->native_integers) {
				/* instance id arrives as an int in R0.w; convert
				 * it in place to float for non-integer GPUs */
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_INT_TO_FLT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		/* other system values fall through to the error below */
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
706
707 static int r600_get_temp(struct r600_shader_ctx *ctx)
708 {
709 return ctx->temp_reg + ctx->max_driver_temp_used++;
710 }
711
712 /*
713 * for evergreen we need to scan the shader to find the number of GPRs we need to
714 * reserve for interpolation.
715 *
716 * we need to know if we are going to emit
717 * any centroid inputs
718 * if perspective and linear are required
719 */
720 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
721 {
722 int i;
723 int num_baryc;
724
725 ctx->input_linear = FALSE;
726 ctx->input_perspective = FALSE;
727 ctx->input_centroid = FALSE;
728 ctx->num_interp_gpr = 1;
729
730 /* any centroid inputs */
731 for (i = 0; i < ctx->info.num_inputs; i++) {
732 /* skip position/face */
733 if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
734 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE)
735 continue;
736 if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_LINEAR)
737 ctx->input_linear = TRUE;
738 if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE)
739 ctx->input_perspective = TRUE;
740 if (ctx->info.input_centroid[i])
741 ctx->input_centroid = TRUE;
742 }
743
744 num_baryc = 0;
745 /* ignoring sample for now */
746 if (ctx->input_perspective)
747 num_baryc++;
748 if (ctx->input_linear)
749 num_baryc++;
750 if (ctx->input_centroid)
751 num_baryc *= 2;
752
753 ctx->num_interp_gpr += (num_baryc + 1) >> 1;
754
755 /* XXX PULL MODEL and LINE STIPPLE, FIXED PT POS */
756 return ctx->num_interp_gpr;
757 }
758
/* Decode a TGSI full source register into the backend's r600_shader_src
 * form: swizzle/neg/abs modifiers, and the selector mapping for
 * immediates, system values, and regular register files. */
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		/* a uniformly-swizzled immediate may map to a hardware
		 * inline constant (0, 1, 0.5, ...) instead of a literal */
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		/* instance id lives in R0.w, vertex id in R0.x — override
		 * the swizzle to broadcast the right channel */
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		}
	} else {
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		/* regular file: bias the index by the file's GPR base */
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		if (tgsi_src->Register.Dimension) {
			/* 2D constant: dimension index selects the const buffer */
			r600_src->kc_bank = tgsi_src->Dimension.Index;
		}
	}
}
811
/* Fetch a relatively-addressed constant from constant buffer cb_idx into
 * dst_reg via a vertex fetch.  The fetch address is the AR register plus
 * the static offset (added with an ADD_INT into dst_reg, which doubles as
 * scratch, when offset != 0).  Returns 0 or an emit error. */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx, unsigned int cb_idx, unsigned int offset, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		/* dst_reg.x = ar_reg.x + offset */
		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
	vtx.src_gpr = ar_reg;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */
	vtx.endian = r600_endian_swap(32);

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
862
863 static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
864 {
865 struct r600_bytecode_vtx vtx;
866 int r;
867 unsigned index = src->Register.Index;
868 unsigned vtx_id = src->Dimension.Index;
869 int offset_reg = vtx_id / 3;
870 int offset_chan = vtx_id % 3;
871
872 /* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
873 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
874
875 if (offset_reg == 0 && offset_chan == 2)
876 offset_chan = 3;
877
878 if (src->Dimension.Indirect) {
879 int treg[3];
880 int t2;
881 struct r600_bytecode_alu alu;
882 int r, i;
883
884 /* you have got to be shitting me -
885 we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
886 at least this is what fglrx seems to do. */
887 for (i = 0; i < 3; i++) {
888 treg[i] = r600_get_temp(ctx);
889 }
890 t2 = r600_get_temp(ctx);
891 for (i = 0; i < 3; i++) {
892 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
893 alu.op = ALU_OP1_MOV;
894 alu.src[0].sel = 0;
895 alu.src[0].chan = i == 2 ? 3 : i;
896 alu.dst.sel = treg[i];
897 alu.dst.chan = 0;
898 alu.dst.write = 1;
899 alu.last = 1;
900 r = r600_bytecode_add_alu(ctx->bc, &alu);
901 if (r)
902 return r;
903 }
904 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
905 alu.op = ALU_OP1_MOV;
906 alu.src[0].sel = treg[0];
907 alu.src[0].rel = 1;
908 alu.dst.sel = t2;
909 alu.dst.write = 1;
910 alu.last = 1;
911 r = r600_bytecode_add_alu(ctx->bc, &alu);
912 if (r)
913 return r;
914 offset_reg = t2;
915 }
916
917
918 memset(&vtx, 0, sizeof(vtx));
919 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
920 vtx.fetch_type = 2; /* VTX_FETCH_NO_INDEX_OFFSET */
921 vtx.src_gpr = offset_reg;
922 vtx.src_sel_x = offset_chan;
923 vtx.offset = index * 16; /*bytes*/
924 vtx.mega_fetch_count = 16;
925 vtx.dst_gpr = dst_reg;
926 vtx.dst_sel_x = 0; /* SEL_X */
927 vtx.dst_sel_y = 1; /* SEL_Y */
928 vtx.dst_sel_z = 2; /* SEL_Z */
929 vtx.dst_sel_w = 3; /* SEL_W */
930 if (ctx->bc->chip_class >= EVERGREEN) {
931 vtx.use_const_fields = 1;
932 } else {
933 vtx.data_format = FMT_32_32_32_32_FLOAT;
934 }
935
936 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
937 return r;
938
939 return 0;
940 }
941
942 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
943 {
944 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
945 int i;
946
947 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
948 struct tgsi_full_src_register *src = &inst->Src[i];
949
950 if (src->Register.File == TGSI_FILE_INPUT) {
951 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
952 /* primitive id is in R0.z */
953 ctx->src[i].sel = 0;
954 ctx->src[i].swizzle[0] = 2;
955 }
956 }
957 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
958 int treg = r600_get_temp(ctx);
959
960 fetch_gs_input(ctx, src, treg);
961 ctx->src[i].sel = treg;
962 }
963 }
964 return 0;
965 }
966
/* Decode all sources of the current instruction and resolve constant-file
 * operands.  Relatively-addressed constants are always fetched into a
 * temp; additionally, because an ALU group may read from only one
 * constant-file source, all but the last constant operand (counted down
 * by j) are copied to temps with MOVs.  Returns 0 or an emit error. */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* first pass: count constants and decode every source */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	/* j counts remaining constants; when it reaches 0 the last one may
	 * stay in the constant file */
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			/* indirect constant: fetch it (sel is biased by 512
			 * for the constant file, hence the subtraction) */
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].sel - 512, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* copy the constant to a temp with four MOVs */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
1017
1018 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1019 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1020 {
1021 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1022 struct r600_bytecode_alu alu;
1023 int i, j, k, nliteral, r;
1024
1025 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1026 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1027 nliteral++;
1028 }
1029 }
1030 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1031 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1032 int treg = r600_get_temp(ctx);
1033 for (k = 0; k < 4; k++) {
1034 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1035 alu.op = ALU_OP1_MOV;
1036 alu.src[0].sel = ctx->src[i].sel;
1037 alu.src[0].chan = k;
1038 alu.src[0].value = ctx->src[i].value[k];
1039 alu.dst.sel = treg;
1040 alu.dst.chan = k;
1041 alu.dst.write = 1;
1042 if (k == 3)
1043 alu.last = 1;
1044 r = r600_bytecode_add_alu(ctx->bc, &alu);
1045 if (r)
1046 return r;
1047 }
1048 ctx->src[i].sel = treg;
1049 j--;
1050 }
1051 }
1052 return 0;
1053 }
1054
1055 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1056 {
1057 int i, r, count = ctx->shader->ninput;
1058
1059 for (i = 0; i < count; i++) {
1060 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1061 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
1062 if (r)
1063 return r;
1064 }
1065 }
1066 return 0;
1067 }
1068
/* Emit stream-output (transform feedback) writes for every output listed
 * in 'so'. Each output is stored to one of up to four stream-output
 * buffers with a MEM_STREAM* CF instruction. May mutate so->output[i]
 * (start_component is reset to 0 when lowering is needed). Returns 0 on
 * success or a negative errno value. */
static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so)
{
	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
	int i, j, r;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SHADER_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		r = -EINVAL;
		goto out_err;
	}
	/* Only buffer indices 0..3 are representable by the MEM_STREAM ops below. */
	for (i = 0; i < so->num_outputs; i++) {
		if (so->output[i].output_buffer >= 4) {
			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
				 so->output[i].output_buffer);
			r = -EINVAL;
			goto out_err;
		}
	}

	/* Initialize locations where the outputs are stored. */
	for (i = 0; i < so->num_outputs; i++) {
		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;

		/* Lower outputs with dst_offset < start_component.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X. */
		if (so->output[i].dst_offset < so->output[i].start_component) {
			unsigned tmp = r600_get_temp(ctx);

			/* Shift the used components down so they start at X. */
			for (j = 0; j < so->output[i].num_components; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = so_gpr[i];
				alu.src[0].chan = so->output[i].start_component + j;

				alu.dst.sel = tmp;
				alu.dst.chan = j;
				alu.dst.write = 1;
				if (j == so->output[i].num_components - 1)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			so->output[i].start_component = 0;
			so_gpr[i] = tmp;
		}
	}

	/* Write outputs to buffers. */
	for (i = 0; i < so->num_outputs; i++) {
		struct r600_bytecode_output output;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = so_gpr[i];
		output.elem_size = so->output[i].num_components;
		output.array_base = so->output[i].dst_offset - so->output[i].start_component;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
		output.burst_count = 1;
		/* array_size is an upper limit for the burst_count
		 * with MEM_STREAM instructions */
		output.array_size = 0xFFF;
		output.comp_mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
		/* Evergreen+ encodes the target buffer in STREAM0_BUFn opcodes;
		 * R600/R700 use one opcode per stream (STREAM0..STREAM3). */
		if (ctx->bc->chip_class >= EVERGREEN) {
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0_BUF0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM0_BUF1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM0_BUF2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM0_BUF3;
				break;
			}
		} else {
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM3;
				break;
			}
		}
		r = r600_bytecode_add_output(ctx->bc, &output);
		if (r)
			goto out_err;
	}
	return 0;
out_err:
	return r;
}
1176
1177 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
1178 {
1179 struct r600_bytecode_alu alu;
1180 unsigned reg;
1181
1182 if (!ctx->shader->vs_out_edgeflag)
1183 return;
1184
1185 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
1186
1187 /* clamp(x, 0, 1) */
1188 memset(&alu, 0, sizeof(alu));
1189 alu.op = ALU_OP1_MOV;
1190 alu.src[0].sel = reg;
1191 alu.dst.sel = reg;
1192 alu.dst.write = 1;
1193 alu.dst.clamp = 1;
1194 alu.last = 1;
1195 r600_bytecode_add_alu(ctx->bc, &alu);
1196
1197 memset(&alu, 0, sizeof(alu));
1198 alu.op = ALU_OP1_FLT_TO_INT;
1199 alu.src[0].sel = reg;
1200 alu.dst.sel = reg;
1201 alu.dst.write = 1;
1202 alu.last = 1;
1203 r600_bytecode_add_alu(ctx->bc, &alu);
1204 }
1205
1206 static int generate_gs_copy_shader(struct r600_context *rctx,
1207 struct r600_pipe_shader *gs,
1208 struct pipe_stream_output_info *so)
1209 {
1210 struct r600_shader_ctx ctx = {};
1211 struct r600_shader *gs_shader = &gs->shader;
1212 struct r600_pipe_shader *cshader;
1213 int ocnt = gs_shader->noutput;
1214 struct r600_bytecode_alu alu;
1215 struct r600_bytecode_vtx vtx;
1216 struct r600_bytecode_output output;
1217 struct r600_bytecode_cf *cf_jump, *cf_pop,
1218 *last_exp_pos = NULL, *last_exp_param = NULL;
1219 int i, next_clip_pos = 61, next_param = 0;
1220
1221 cshader = calloc(1, sizeof(struct r600_pipe_shader));
1222 if (!cshader)
1223 return 0;
1224
1225 memcpy(cshader->shader.output, gs_shader->output, ocnt *
1226 sizeof(struct r600_shader_io));
1227
1228 cshader->shader.noutput = ocnt;
1229
1230 ctx.shader = &cshader->shader;
1231 ctx.bc = &ctx.shader->bc;
1232 ctx.type = ctx.bc->type = TGSI_PROCESSOR_VERTEX;
1233
1234 r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
1235 rctx->screen->has_compressed_msaa_texturing);
1236
1237 ctx.bc->isa = rctx->isa;
1238
1239 /* R0.x = R0.x & 0x3fffffff */
1240 memset(&alu, 0, sizeof(alu));
1241 alu.op = ALU_OP2_AND_INT;
1242 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1243 alu.src[1].value = 0x3fffffff;
1244 alu.dst.write = 1;
1245 r600_bytecode_add_alu(ctx.bc, &alu);
1246
1247 /* R0.y = R0.x >> 30 */
1248 memset(&alu, 0, sizeof(alu));
1249 alu.op = ALU_OP2_LSHR_INT;
1250 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1251 alu.src[1].value = 0x1e;
1252 alu.dst.chan = 1;
1253 alu.dst.write = 1;
1254 alu.last = 1;
1255 r600_bytecode_add_alu(ctx.bc, &alu);
1256
1257 /* PRED_SETE_INT __, R0.y, 0 */
1258 memset(&alu, 0, sizeof(alu));
1259 alu.op = ALU_OP2_PRED_SETE_INT;
1260 alu.src[0].chan = 1;
1261 alu.src[1].sel = V_SQ_ALU_SRC_0;
1262 alu.execute_mask = 1;
1263 alu.update_pred = 1;
1264 alu.last = 1;
1265 r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
1266
1267 r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
1268 cf_jump = ctx.bc->cf_last;
1269
1270 /* fetch vertex data from GSVS ring */
1271 for (i = 0; i < ocnt; ++i) {
1272 struct r600_shader_io *out = &ctx.shader->output[i];
1273 out->gpr = i + 1;
1274 out->ring_offset = i * 16;
1275
1276 memset(&vtx, 0, sizeof(vtx));
1277 vtx.op = FETCH_OP_VFETCH;
1278 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1279 vtx.fetch_type = 2;
1280 vtx.offset = out->ring_offset;
1281 vtx.dst_gpr = out->gpr;
1282 vtx.dst_sel_x = 0;
1283 vtx.dst_sel_y = 1;
1284 vtx.dst_sel_z = 2;
1285 vtx.dst_sel_w = 3;
1286 if (rctx->b.chip_class >= EVERGREEN) {
1287 vtx.use_const_fields = 1;
1288 } else {
1289 vtx.data_format = FMT_32_32_32_32_FLOAT;
1290 }
1291
1292 r600_bytecode_add_vtx(ctx.bc, &vtx);
1293 }
1294
1295 /* XXX handle clipvertex, streamout? */
1296 emit_streamout(&ctx, so);
1297
1298 /* export vertex data */
1299 /* XXX factor out common code with r600_shader_from_tgsi ? */
1300 for (i = 0; i < ocnt; ++i) {
1301 struct r600_shader_io *out = &ctx.shader->output[i];
1302
1303 if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
1304 continue;
1305
1306 memset(&output, 0, sizeof(output));
1307 output.gpr = out->gpr;
1308 output.elem_size = 3;
1309 output.swizzle_x = 0;
1310 output.swizzle_y = 1;
1311 output.swizzle_z = 2;
1312 output.swizzle_w = 3;
1313 output.burst_count = 1;
1314 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1315 output.op = CF_OP_EXPORT;
1316 switch (out->name) {
1317 case TGSI_SEMANTIC_POSITION:
1318 output.array_base = 60;
1319 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1320 break;
1321
1322 case TGSI_SEMANTIC_PSIZE:
1323 output.array_base = 61;
1324 if (next_clip_pos == 61)
1325 next_clip_pos = 62;
1326 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1327 output.swizzle_y = 7;
1328 output.swizzle_z = 7;
1329 output.swizzle_w = 7;
1330 ctx.shader->vs_out_misc_write = 1;
1331 ctx.shader->vs_out_point_size = 1;
1332 break;
1333 case TGSI_SEMANTIC_LAYER:
1334 output.array_base = 61;
1335 if (next_clip_pos == 61)
1336 next_clip_pos = 62;
1337 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1338 output.swizzle_x = 7;
1339 output.swizzle_y = 7;
1340 output.swizzle_z = 0;
1341 output.swizzle_w = 7;
1342 ctx.shader->vs_out_misc_write = 1;
1343 ctx.shader->vs_out_layer = 1;
1344 break;
1345 case TGSI_SEMANTIC_VIEWPORT_INDEX:
1346 output.array_base = 61;
1347 if (next_clip_pos == 61)
1348 next_clip_pos = 62;
1349 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1350 ctx.shader->vs_out_misc_write = 1;
1351 ctx.shader->vs_out_viewport = 1;
1352 output.swizzle_x = 7;
1353 output.swizzle_y = 7;
1354 output.swizzle_z = 7;
1355 output.swizzle_w = 0;
1356 break;
1357 case TGSI_SEMANTIC_CLIPDIST:
1358 /* spi_sid is 0 for clipdistance outputs that were generated
1359 * for clipvertex - we don't need to pass them to PS */
1360 ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
1361 if (out->spi_sid) {
1362 /* duplicate it as PARAM to pass to the pixel shader */
1363 output.array_base = next_param++;
1364 r600_bytecode_add_output(ctx.bc, &output);
1365 last_exp_param = ctx.bc->cf_last;
1366 }
1367 output.array_base = next_clip_pos++;
1368 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1369 break;
1370 case TGSI_SEMANTIC_FOG:
1371 output.swizzle_y = 4; /* 0 */
1372 output.swizzle_z = 4; /* 0 */
1373 output.swizzle_w = 5; /* 1 */
1374 break;
1375 default:
1376 output.array_base = next_param++;
1377 break;
1378 }
1379 r600_bytecode_add_output(ctx.bc, &output);
1380 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
1381 last_exp_param = ctx.bc->cf_last;
1382 else
1383 last_exp_pos = ctx.bc->cf_last;
1384 }
1385
1386 if (!last_exp_pos) {
1387 memset(&output, 0, sizeof(output));
1388 output.gpr = 0;
1389 output.elem_size = 3;
1390 output.swizzle_x = 7;
1391 output.swizzle_y = 7;
1392 output.swizzle_z = 7;
1393 output.swizzle_w = 7;
1394 output.burst_count = 1;
1395 output.type = 2;
1396 output.op = CF_OP_EXPORT;
1397 output.array_base = 60;
1398 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1399 r600_bytecode_add_output(ctx.bc, &output);
1400 last_exp_pos = ctx.bc->cf_last;
1401 }
1402
1403 if (!last_exp_param) {
1404 memset(&output, 0, sizeof(output));
1405 output.gpr = 0;
1406 output.elem_size = 3;
1407 output.swizzle_x = 7;
1408 output.swizzle_y = 7;
1409 output.swizzle_z = 7;
1410 output.swizzle_w = 7;
1411 output.burst_count = 1;
1412 output.type = 2;
1413 output.op = CF_OP_EXPORT;
1414 output.array_base = next_param++;
1415 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1416 r600_bytecode_add_output(ctx.bc, &output);
1417 last_exp_param = ctx.bc->cf_last;
1418 }
1419
1420 last_exp_pos->op = CF_OP_EXPORT_DONE;
1421 last_exp_param->op = CF_OP_EXPORT_DONE;
1422
1423 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
1424 cf_pop = ctx.bc->cf_last;
1425
1426 cf_jump->cf_addr = cf_pop->id + 2;
1427 cf_jump->pop_count = 1;
1428 cf_pop->cf_addr = cf_pop->id + 2;
1429 cf_pop->pop_count = 1;
1430
1431 if (ctx.bc->chip_class == CAYMAN)
1432 cm_bytecode_add_cf_end(ctx.bc);
1433 else {
1434 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
1435 ctx.bc->cf_last->end_of_program = 1;
1436 }
1437
1438 gs->gs_copy_shader = cshader;
1439
1440 ctx.bc->nstack = 1;
1441 cshader->shader.ring_item_size = ocnt * 16;
1442
1443 return r600_bytecode_build(ctx.bc);
1444 }
1445
1446 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind)
1447 {
1448 struct r600_bytecode_output output;
1449 int i, k, ring_offset;
1450
1451 for (i = 0; i < ctx->shader->noutput; i++) {
1452 if (ctx->gs_for_vs) {
1453 /* for ES we need to lookup corresponding ring offset expected by GS
1454 * (map this output to GS input by name and sid) */
1455 /* FIXME precompute offsets */
1456 ring_offset = -1;
1457 for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
1458 struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
1459 struct r600_shader_io *out = &ctx->shader->output[i];
1460 if (in->name == out->name && in->sid == out->sid)
1461 ring_offset = in->ring_offset;
1462 }
1463
1464 if (ring_offset == -1)
1465 continue;
1466 } else
1467 ring_offset = i * 16;
1468
1469 /* next_ring_offset after parsing input decls contains total size of
1470 * single vertex data, gs_next_vertex - current vertex index */
1471 if (!ind)
1472 ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
1473
1474 /* get a temp and add the ring offset to the next vertex base in the shader */
1475 memset(&output, 0, sizeof(struct r600_bytecode_output));
1476 output.gpr = ctx->shader->output[i].gpr;
1477 output.elem_size = 3;
1478 output.comp_mask = 0xF;
1479 output.burst_count = 1;
1480
1481 if (ind)
1482 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
1483 else
1484 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1485 output.op = CF_OP_MEM_RING;
1486
1487
1488 if (ind) {
1489 output.array_base = ring_offset >> 2; /* in dwords */
1490 output.array_size = 0xfff;
1491 output.index_gpr = ctx->gs_export_gpr_treg;
1492 } else
1493 output.array_base = ring_offset >> 2; /* in dwords */
1494 r600_bytecode_add_output(ctx->bc, &output);
1495 }
1496
1497 if (ind) {
1498 struct r600_bytecode_alu alu;
1499 int r;
1500
1501 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1502 alu.op = ALU_OP2_ADD_INT;
1503 alu.src[0].sel = ctx->gs_export_gpr_treg;
1504 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1505 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
1506 alu.dst.sel = ctx->gs_export_gpr_treg;
1507 alu.dst.write = 1;
1508 alu.last = 1;
1509 r = r600_bytecode_add_alu(ctx->bc, &alu);
1510 if (r)
1511 return r;
1512 }
1513 ++ctx->gs_next_vertex;
1514 return 0;
1515 }
1516
1517 static int r600_shader_from_tgsi(struct r600_context *rctx,
1518 struct r600_pipe_shader *pipeshader,
1519 struct r600_shader_key key)
1520 {
1521 struct r600_screen *rscreen = rctx->screen;
1522 struct r600_shader *shader = &pipeshader->shader;
1523 struct tgsi_token *tokens = pipeshader->selector->tokens;
1524 struct pipe_stream_output_info so = pipeshader->selector->so;
1525 struct tgsi_full_immediate *immediate;
1526 struct tgsi_full_property *property;
1527 struct r600_shader_ctx ctx;
1528 struct r600_bytecode_output output[32];
1529 unsigned output_done, noutput;
1530 unsigned opcode;
1531 int i, j, k, r = 0;
1532 int next_param_base = 0, next_clip_base;
1533 int max_color_exports = MAX2(key.nr_cbufs, 1);
1534 /* Declarations used by llvm code */
1535 bool use_llvm = false;
1536 bool indirect_gprs;
1537 bool ring_outputs = false;
1538 bool pos_emitted = false;
1539
1540 #ifdef R600_USE_LLVM
1541 use_llvm = rscreen->b.debug_flags & DBG_LLVM;
1542 #endif
1543 ctx.bc = &shader->bc;
1544 ctx.shader = shader;
1545 ctx.native_integers = true;
1546
1547 shader->vs_as_es = key.vs_as_es;
1548
1549 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
1550 rscreen->has_compressed_msaa_texturing);
1551 ctx.tokens = tokens;
1552 tgsi_scan_shader(tokens, &ctx.info);
1553 shader->indirect_files = ctx.info.indirect_files;
1554 indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT);
1555 tgsi_parse_init(&ctx.parse, tokens);
1556 ctx.type = ctx.parse.FullHeader.Processor.Processor;
1557 shader->processor_type = ctx.type;
1558 ctx.bc->type = shader->processor_type;
1559
1560 ring_outputs = key.vs_as_es || (ctx.type == TGSI_PROCESSOR_GEOMETRY);
1561
1562 if (key.vs_as_es) {
1563 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
1564 } else {
1565 ctx.gs_for_vs = NULL;
1566 }
1567
1568 ctx.next_ring_offset = 0;
1569 ctx.gs_out_ring_offset = 0;
1570 ctx.gs_next_vertex = 0;
1571
1572 ctx.face_gpr = -1;
1573 ctx.fragcoord_input = -1;
1574 ctx.colors_used = 0;
1575 ctx.clip_vertex_write = 0;
1576
1577 shader->nr_ps_color_exports = 0;
1578 shader->nr_ps_max_color_exports = 0;
1579
1580 shader->two_side = key.color_two_side;
1581
1582 /* register allocations */
1583 /* Values [0,127] correspond to GPR[0..127].
1584 * Values [128,159] correspond to constant buffer bank 0
1585 * Values [160,191] correspond to constant buffer bank 1
1586 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1587 * Values [256,287] correspond to constant buffer bank 2 (EG)
1588 * Values [288,319] correspond to constant buffer bank 3 (EG)
1589 * Other special values are shown in the list below.
1590 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1591 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1592 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1593 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1594 * 248 SQ_ALU_SRC_0: special constant 0.0.
1595 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
1596 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
1597 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1598 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
1599 * 253 SQ_ALU_SRC_LITERAL: literal constant.
1600 * 254 SQ_ALU_SRC_PV: previous vector result.
1601 * 255 SQ_ALU_SRC_PS: previous scalar result.
1602 */
1603 for (i = 0; i < TGSI_FILE_COUNT; i++) {
1604 ctx.file_offset[i] = 0;
1605 }
1606
1607 #ifdef R600_USE_LLVM
1608 if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
1609 fprintf(stderr, "Warning: R600 LLVM backend does not support "
1610 "indirect adressing. Falling back to TGSI "
1611 "backend.\n");
1612 use_llvm = 0;
1613 }
1614 #endif
1615 if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1616 ctx.file_offset[TGSI_FILE_INPUT] = 1;
1617 if (!use_llvm) {
1618 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
1619 }
1620 }
1621 if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) {
1622 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
1623 }
1624 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
1625 /* FIXME 1 would be enough in some cases (3 or less input vertices) */
1626 ctx.file_offset[TGSI_FILE_INPUT] = 2;
1627 }
1628 ctx.use_llvm = use_llvm;
1629
1630 if (use_llvm) {
1631 ctx.file_offset[TGSI_FILE_OUTPUT] =
1632 ctx.file_offset[TGSI_FILE_INPUT];
1633 } else {
1634 ctx.file_offset[TGSI_FILE_OUTPUT] =
1635 ctx.file_offset[TGSI_FILE_INPUT] +
1636 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1637 }
1638 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
1639 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
1640
1641 /* Outside the GPR range. This will be translated to one of the
1642 * kcache banks later. */
1643 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
1644
1645 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
1646 ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
1647 ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
1648 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
1649 ctx.gs_export_gpr_treg = ctx.bc->ar_reg + 1;
1650 ctx.temp_reg = ctx.bc->ar_reg + 2;
1651 } else
1652 ctx.temp_reg = ctx.bc->ar_reg + 1;
1653
1654 if (indirect_gprs) {
1655 shader->max_arrays = 0;
1656 shader->num_arrays = 0;
1657
1658 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
1659 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
1660 ctx.file_offset[TGSI_FILE_OUTPUT] -
1661 ctx.file_offset[TGSI_FILE_INPUT],
1662 0x0F);
1663 }
1664 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
1665 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
1666 ctx.file_offset[TGSI_FILE_TEMPORARY] -
1667 ctx.file_offset[TGSI_FILE_OUTPUT],
1668 0x0F);
1669 }
1670 }
1671
1672 ctx.nliterals = 0;
1673 ctx.literals = NULL;
1674 shader->fs_write_all = FALSE;
1675 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1676 tgsi_parse_token(&ctx.parse);
1677 switch (ctx.parse.FullToken.Token.Type) {
1678 case TGSI_TOKEN_TYPE_IMMEDIATE:
1679 immediate = &ctx.parse.FullToken.FullImmediate;
1680 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
1681 if(ctx.literals == NULL) {
1682 r = -ENOMEM;
1683 goto out_err;
1684 }
1685 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
1686 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
1687 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
1688 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
1689 ctx.nliterals++;
1690 break;
1691 case TGSI_TOKEN_TYPE_DECLARATION:
1692 r = tgsi_declaration(&ctx);
1693 if (r)
1694 goto out_err;
1695 break;
1696 case TGSI_TOKEN_TYPE_INSTRUCTION:
1697 break;
1698 case TGSI_TOKEN_TYPE_PROPERTY:
1699 property = &ctx.parse.FullToken.FullProperty;
1700 switch (property->Property.PropertyName) {
1701 case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
1702 if (property->u[0].Data == 1)
1703 shader->fs_write_all = TRUE;
1704 break;
1705 case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
1706 /* we don't need this one */
1707 break;
1708 case TGSI_PROPERTY_GS_INPUT_PRIM:
1709 shader->gs_input_prim = property->u[0].Data;
1710 break;
1711 case TGSI_PROPERTY_GS_OUTPUT_PRIM:
1712 shader->gs_output_prim = property->u[0].Data;
1713 break;
1714 case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES:
1715 shader->gs_max_out_vertices = property->u[0].Data;
1716 break;
1717 }
1718 break;
1719 default:
1720 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
1721 r = -EINVAL;
1722 goto out_err;
1723 }
1724 }
1725
1726 shader->ring_item_size = ctx.next_ring_offset;
1727
1728 /* Process two side if needed */
1729 if (shader->two_side && ctx.colors_used) {
1730 int i, count = ctx.shader->ninput;
1731 unsigned next_lds_loc = ctx.shader->nlds;
1732
1733 /* additional inputs will be allocated right after the existing inputs,
1734 * we won't need them after the color selection, so we don't need to
1735 * reserve these gprs for the rest of the shader code and to adjust
1736 * output offsets etc. */
1737 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
1738 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1739
1740 if (ctx.face_gpr == -1) {
1741 i = ctx.shader->ninput++;
1742 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
1743 ctx.shader->input[i].spi_sid = 0;
1744 ctx.shader->input[i].gpr = gpr++;
1745 ctx.face_gpr = ctx.shader->input[i].gpr;
1746 }
1747
1748 for (i = 0; i < count; i++) {
1749 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1750 int ni = ctx.shader->ninput++;
1751 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
1752 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
1753 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
1754 ctx.shader->input[ni].gpr = gpr++;
1755 // TGSI to LLVM needs to know the lds position of inputs.
1756 // Non LLVM path computes it later (in process_twoside_color)
1757 ctx.shader->input[ni].lds_pos = next_lds_loc++;
1758 ctx.shader->input[i].back_color_input = ni;
1759 if (ctx.bc->chip_class >= EVERGREEN) {
1760 if ((r = evergreen_interp_input(&ctx, ni)))
1761 return r;
1762 }
1763 }
1764 }
1765 }
1766
1767 /* LLVM backend setup */
1768 #ifdef R600_USE_LLVM
1769 if (use_llvm) {
1770 struct radeon_llvm_context radeon_llvm_ctx;
1771 LLVMModuleRef mod;
1772 bool dump = r600_can_dump_shader(&rscreen->b, tokens);
1773 boolean use_kill = false;
1774
1775 memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
1776 radeon_llvm_ctx.type = ctx.type;
1777 radeon_llvm_ctx.two_side = shader->two_side;
1778 radeon_llvm_ctx.face_gpr = ctx.face_gpr;
1779 radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1;
1780 radeon_llvm_ctx.r600_inputs = ctx.shader->input;
1781 radeon_llvm_ctx.r600_outputs = ctx.shader->output;
1782 radeon_llvm_ctx.color_buffer_count = max_color_exports;
1783 radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
1784 radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
1785 radeon_llvm_ctx.stream_outputs = &so;
1786 radeon_llvm_ctx.clip_vertex = ctx.cv_output;
1787 radeon_llvm_ctx.alpha_to_one = key.alpha_to_one;
1788 radeon_llvm_ctx.has_compressed_msaa_texturing =
1789 ctx.bc->has_compressed_msaa_texturing;
1790 mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
1791 ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
1792 ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
1793
1794 if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
1795 radeon_llvm_dispose(&radeon_llvm_ctx);
1796 use_llvm = 0;
1797 fprintf(stderr, "R600 LLVM backend failed to compile "
1798 "shader. Falling back to TGSI\n");
1799 } else {
1800 ctx.file_offset[TGSI_FILE_OUTPUT] =
1801 ctx.file_offset[TGSI_FILE_INPUT];
1802 }
1803 if (use_kill)
1804 ctx.shader->uses_kill = use_kill;
1805 radeon_llvm_dispose(&radeon_llvm_ctx);
1806 }
1807 #endif
1808 /* End of LLVM backend setup */
1809
1810 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
1811 shader->nr_ps_max_color_exports = 8;
1812
1813 if (!use_llvm) {
1814 if (ctx.fragcoord_input >= 0) {
1815 if (ctx.bc->chip_class == CAYMAN) {
1816 for (j = 0 ; j < 4; j++) {
1817 struct r600_bytecode_alu alu;
1818 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1819 alu.op = ALU_OP1_RECIP_IEEE;
1820 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1821 alu.src[0].chan = 3;
1822
1823 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1824 alu.dst.chan = j;
1825 alu.dst.write = (j == 3);
1826 alu.last = 1;
1827 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1828 return r;
1829 }
1830 } else {
1831 struct r600_bytecode_alu alu;
1832 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1833 alu.op = ALU_OP1_RECIP_IEEE;
1834 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1835 alu.src[0].chan = 3;
1836
1837 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1838 alu.dst.chan = 3;
1839 alu.dst.write = 1;
1840 alu.last = 1;
1841 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1842 return r;
1843 }
1844 }
1845
1846 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
1847 struct r600_bytecode_alu alu;
1848 int r;
1849
1850 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1851 alu.op = ALU_OP1_MOV;
1852 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
1853 alu.src[0].value = 0;
1854 alu.dst.sel = ctx.gs_export_gpr_treg;
1855 alu.dst.write = 1;
1856 alu.last = 1;
1857 r = r600_bytecode_add_alu(ctx.bc, &alu);
1858 if (r)
1859 return r;
1860 }
1861 if (shader->two_side && ctx.colors_used) {
1862 if ((r = process_twoside_color_inputs(&ctx)))
1863 return r;
1864 }
1865
1866 tgsi_parse_init(&ctx.parse, tokens);
1867 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1868 tgsi_parse_token(&ctx.parse);
1869 switch (ctx.parse.FullToken.Token.Type) {
1870 case TGSI_TOKEN_TYPE_INSTRUCTION:
1871 r = tgsi_is_supported(&ctx);
1872 if (r)
1873 goto out_err;
1874 ctx.max_driver_temp_used = 0;
1875 /* reserve first tmp for everyone */
1876 r600_get_temp(&ctx);
1877
1878 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
1879 if ((r = tgsi_split_constant(&ctx)))
1880 goto out_err;
1881 if ((r = tgsi_split_literal_constant(&ctx)))
1882 goto out_err;
1883 if (ctx.type == TGSI_PROCESSOR_GEOMETRY)
1884 if ((r = tgsi_split_gs_inputs(&ctx)))
1885 goto out_err;
1886 if (ctx.bc->chip_class == CAYMAN)
1887 ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
1888 else if (ctx.bc->chip_class >= EVERGREEN)
1889 ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
1890 else
1891 ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
1892 r = ctx.inst_info->process(&ctx);
1893 if (r)
1894 goto out_err;
1895 break;
1896 default:
1897 break;
1898 }
1899 }
1900 }
1901
1902 /* Reset the temporary register counter. */
1903 ctx.max_driver_temp_used = 0;
1904
1905 noutput = shader->noutput;
1906
1907 if (!ring_outputs && ctx.clip_vertex_write) {
1908 unsigned clipdist_temp[2];
1909
1910 clipdist_temp[0] = r600_get_temp(&ctx);
1911 clipdist_temp[1] = r600_get_temp(&ctx);
1912
1913 /* need to convert a clipvertex write into clipdistance writes and not export
1914 the clip vertex anymore */
1915
1916 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
1917 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1918 shader->output[noutput].gpr = clipdist_temp[0];
1919 noutput++;
1920 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1921 shader->output[noutput].gpr = clipdist_temp[1];
1922 noutput++;
1923
1924 /* reset spi_sid for clipvertex output to avoid confusing spi */
1925 shader->output[ctx.cv_output].spi_sid = 0;
1926
1927 shader->clip_dist_write = 0xFF;
1928
1929 for (i = 0; i < 8; i++) {
1930 int oreg = i >> 2;
1931 int ochan = i & 3;
1932
1933 for (j = 0; j < 4; j++) {
1934 struct r600_bytecode_alu alu;
1935 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1936 alu.op = ALU_OP2_DOT4;
1937 alu.src[0].sel = shader->output[ctx.cv_output].gpr;
1938 alu.src[0].chan = j;
1939
1940 alu.src[1].sel = 512 + i;
1941 alu.src[1].kc_bank = R600_UCP_CONST_BUFFER;
1942 alu.src[1].chan = j;
1943
1944 alu.dst.sel = clipdist_temp[oreg];
1945 alu.dst.chan = j;
1946 alu.dst.write = (j == ochan);
1947 if (j == 3)
1948 alu.last = 1;
1949 if (!use_llvm)
1950 r = r600_bytecode_add_alu(ctx.bc, &alu);
1951 if (r)
1952 return r;
1953 }
1954 }
1955 }
1956
1957 /* Add stream outputs. */
1958 if (!ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX &&
1959 so.num_outputs && !use_llvm)
1960 emit_streamout(&ctx, &so);
1961
1962 convert_edgeflag_to_int(&ctx);
1963
1964 if (ring_outputs) {
1965 if (key.vs_as_es)
1966 emit_gs_ring_writes(&ctx, FALSE);
1967 } else {
1968 /* Export output */
1969 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
1970
1971 for (i = 0, j = 0; i < noutput; i++, j++) {
1972 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1973 output[j].gpr = shader->output[i].gpr;
1974 output[j].elem_size = 3;
1975 output[j].swizzle_x = 0;
1976 output[j].swizzle_y = 1;
1977 output[j].swizzle_z = 2;
1978 output[j].swizzle_w = 3;
1979 output[j].burst_count = 1;
1980 output[j].type = -1;
1981 output[j].op = CF_OP_EXPORT;
1982 switch (ctx.type) {
1983 case TGSI_PROCESSOR_VERTEX:
1984 switch (shader->output[i].name) {
1985 case TGSI_SEMANTIC_POSITION:
1986 output[j].array_base = 60;
1987 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1988 pos_emitted = true;
1989 break;
1990
1991 case TGSI_SEMANTIC_PSIZE:
1992 output[j].array_base = 61;
1993 output[j].swizzle_y = 7;
1994 output[j].swizzle_z = 7;
1995 output[j].swizzle_w = 7;
1996 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1997 pos_emitted = true;
1998 break;
1999 case TGSI_SEMANTIC_EDGEFLAG:
2000 output[j].array_base = 61;
2001 output[j].swizzle_x = 7;
2002 output[j].swizzle_y = 0;
2003 output[j].swizzle_z = 7;
2004 output[j].swizzle_w = 7;
2005 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2006 pos_emitted = true;
2007 break;
2008 case TGSI_SEMANTIC_LAYER:
2009 output[j].array_base = 61;
2010 output[j].swizzle_x = 7;
2011 output[j].swizzle_y = 7;
2012 output[j].swizzle_z = 0;
2013 output[j].swizzle_w = 7;
2014 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2015 pos_emitted = true;
2016 break;
2017 case TGSI_SEMANTIC_CLIPVERTEX:
2018 j--;
2019 break;
2020 case TGSI_SEMANTIC_CLIPDIST:
2021 output[j].array_base = next_clip_base++;
2022 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2023 pos_emitted = true;
2024 /* spi_sid is 0 for clipdistance outputs that were generated
2025 * for clipvertex - we don't need to pass them to PS */
2026 if (shader->output[i].spi_sid) {
2027 j++;
2028 /* duplicate it as PARAM to pass to the pixel shader */
2029 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2030 output[j].array_base = next_param_base++;
2031 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2032 }
2033 break;
2034 case TGSI_SEMANTIC_FOG:
2035 output[j].swizzle_y = 4; /* 0 */
2036 output[j].swizzle_z = 4; /* 0 */
2037 output[j].swizzle_w = 5; /* 1 */
2038 break;
2039 }
2040 break;
2041 case TGSI_PROCESSOR_FRAGMENT:
2042 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
2043 /* never export more colors than the number of CBs */
2044 if (shader->output[i].sid >= max_color_exports) {
2045 /* skip export */
2046 j--;
2047 continue;
2048 }
2049 output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
2050 output[j].array_base = shader->output[i].sid;
2051 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2052 shader->nr_ps_color_exports++;
2053 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
2054 for (k = 1; k < max_color_exports; k++) {
2055 j++;
2056 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2057 output[j].gpr = shader->output[i].gpr;
2058 output[j].elem_size = 3;
2059 output[j].swizzle_x = 0;
2060 output[j].swizzle_y = 1;
2061 output[j].swizzle_z = 2;
2062 output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
2063 output[j].burst_count = 1;
2064 output[j].array_base = k;
2065 output[j].op = CF_OP_EXPORT;
2066 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2067 shader->nr_ps_color_exports++;
2068 }
2069 }
2070 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
2071 output[j].array_base = 61;
2072 output[j].swizzle_x = 2;
2073 output[j].swizzle_y = 7;
2074 output[j].swizzle_z = output[j].swizzle_w = 7;
2075 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2076 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
2077 output[j].array_base = 61;
2078 output[j].swizzle_x = 7;
2079 output[j].swizzle_y = 1;
2080 output[j].swizzle_z = output[j].swizzle_w = 7;
2081 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2082 } else {
2083 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
2084 r = -EINVAL;
2085 goto out_err;
2086 }
2087 break;
2088 default:
2089 R600_ERR("unsupported processor type %d\n", ctx.type);
2090 r = -EINVAL;
2091 goto out_err;
2092 }
2093
2094 if (output[j].type==-1) {
2095 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2096 output[j].array_base = next_param_base++;
2097 }
2098 }
2099
2100 /* add fake position export */
2101 if (ctx.type == TGSI_PROCESSOR_VERTEX && pos_emitted == false) {
2102 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2103 output[j].gpr = 0;
2104 output[j].elem_size = 3;
2105 output[j].swizzle_x = 7;
2106 output[j].swizzle_y = 7;
2107 output[j].swizzle_z = 7;
2108 output[j].swizzle_w = 7;
2109 output[j].burst_count = 1;
2110 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2111 output[j].array_base = 60;
2112 output[j].op = CF_OP_EXPORT;
2113 j++;
2114 }
2115
2116 /* add fake param output for vertex shader if no param is exported */
2117 if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
2118 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2119 output[j].gpr = 0;
2120 output[j].elem_size = 3;
2121 output[j].swizzle_x = 7;
2122 output[j].swizzle_y = 7;
2123 output[j].swizzle_z = 7;
2124 output[j].swizzle_w = 7;
2125 output[j].burst_count = 1;
2126 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2127 output[j].array_base = 0;
2128 output[j].op = CF_OP_EXPORT;
2129 j++;
2130 }
2131
2132 /* add fake pixel export */
2133 if (ctx.type == TGSI_PROCESSOR_FRAGMENT && shader->nr_ps_color_exports == 0) {
2134 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2135 output[j].gpr = 0;
2136 output[j].elem_size = 3;
2137 output[j].swizzle_x = 7;
2138 output[j].swizzle_y = 7;
2139 output[j].swizzle_z = 7;
2140 output[j].swizzle_w = 7;
2141 output[j].burst_count = 1;
2142 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2143 output[j].array_base = 0;
2144 output[j].op = CF_OP_EXPORT;
2145 j++;
2146 }
2147
2148 noutput = j;
2149
2150 /* set export done on last export of each type */
2151 for (i = noutput - 1, output_done = 0; i >= 0; i--) {
2152 if (!(output_done & (1 << output[i].type))) {
2153 output_done |= (1 << output[i].type);
2154 output[i].op = CF_OP_EXPORT_DONE;
2155 }
2156 }
2157 /* add output to bytecode */
2158 if (!use_llvm) {
2159 for (i = 0; i < noutput; i++) {
2160 r = r600_bytecode_add_output(ctx.bc, &output[i]);
2161 if (r)
2162 goto out_err;
2163 }
2164 }
2165 }
2166
2167 /* add program end */
2168 if (!use_llvm) {
2169 if (ctx.bc->chip_class == CAYMAN)
2170 cm_bytecode_add_cf_end(ctx.bc);
2171 else {
2172 const struct cf_op_info *last = NULL;
2173
2174 if (ctx.bc->cf_last)
2175 last = r600_isa_cf(ctx.bc->cf_last->op);
2176
2177 /* alu clause instructions don't have EOP bit, so add NOP */
2178 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS)
2179 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2180
2181 ctx.bc->cf_last->end_of_program = 1;
2182 }
2183 }
2184
2185 /* check GPR limit - we have 124 = 128 - 4
2186 * (4 are reserved as alu clause temporary registers) */
2187 if (ctx.bc->ngpr > 124) {
2188 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
2189 r = -ENOMEM;
2190 goto out_err;
2191 }
2192
2193 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2194 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
2195 return r;
2196 }
2197
2198 free(ctx.literals);
2199 tgsi_parse_free(&ctx.parse);
2200 return 0;
2201 out_err:
2202 free(ctx.literals);
2203 tgsi_parse_free(&ctx.parse);
2204 return r;
2205 }
2206
2207 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
2208 {
2209 R600_ERR("%s tgsi opcode unsupported\n",
2210 tgsi_get_opcode_name(ctx->inst_info->tgsi_opcode));
2211 return -EINVAL;
2212 }
2213
/* Handler for TGSI_OPCODE_END: nothing to emit here — the epilogue
 * (exports, end-of-program marker) is produced elsewhere. */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	(void)ctx; /* END carries no operands and emits no code */
	return 0;
}
2218
2219 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
2220 const struct r600_shader_src *shader_src,
2221 unsigned chan)
2222 {
2223 bc_src->sel = shader_src->sel;
2224 bc_src->chan = shader_src->swizzle[chan];
2225 bc_src->neg = shader_src->neg;
2226 bc_src->abs = shader_src->abs;
2227 bc_src->rel = shader_src->rel;
2228 bc_src->value = shader_src->value[bc_src->chan];
2229 bc_src->kc_bank = shader_src->kc_bank;
2230 }
2231
2232 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
2233 {
2234 bc_src->abs = 1;
2235 bc_src->neg = 0;
2236 }
2237
2238 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
2239 {
2240 bc_src->neg = !bc_src->neg;
2241 }
2242
2243 static void tgsi_dst(struct r600_shader_ctx *ctx,
2244 const struct tgsi_full_dst_register *tgsi_dst,
2245 unsigned swizzle,
2246 struct r600_bytecode_alu_dst *r600_dst)
2247 {
2248 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2249
2250 r600_dst->sel = tgsi_dst->Register.Index;
2251 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
2252 r600_dst->chan = swizzle;
2253 r600_dst->write = 1;
2254 if (tgsi_dst->Register.Indirect)
2255 r600_dst->rel = V_SQ_REL_RELATIVE;
2256 if (inst->Instruction.Saturate) {
2257 r600_dst->clamp = 1;
2258 }
2259 }
2260
/* Return the highest channel index (0-3) enabled in a TGSI write mask,
 * or 0 when the mask is empty.  The caller uses this to know which
 * emitted ALU instruction must carry the "last in group" bit. */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	for (chan = 3; chan > 0; chan--) {
		if (writemask & (1 << chan))
			return chan;
	}
	return 0;
}
2272
/* Emit a (typically two-source) ALU instruction for every channel enabled
 * in the destination write mask.
 *
 * swap       - emit the first two sources in reversed order
 * trans_only - the opcode runs only on the trans unit: every emitted
 *              instruction is marked "last" (one per ALU group), and when
 *              more than one channel is written the results are staged in
 *              a temp register and copied to the destination afterwards.
 *
 * Returns 0 on success or the error from r600_bytecode_add_alu(). */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			/* stage result in temp_reg; moved to dst below */
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			/* reversed operand order */
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		/* handle some special cases */
		switch (ctx->inst_info->tgsi_opcode) {
		case TGSI_OPCODE_SUB:
			/* SUB is implemented as ADD with negated src1 */
			r600_bytecode_src_toggle_neg(&alu.src[1]);
			break;
		case TGSI_OPCODE_ABS:
			/* ABS is implemented as MOV with the abs modifier */
			r600_bytecode_src_set_abs(&alu.src[0]);
			break;
		default:
			break;
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2342
/* Plain vector-unit op: sources in TGSI order, no temp staging. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
2347
/* Like tgsi_op2() but with src0/src1 emitted in reversed order, for
 * hardware opcodes whose operand order is the opposite of TGSI's. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
2352
/* Like tgsi_op2() but for trans-unit-only opcodes: each instruction is
 * marked "last", and multi-channel results go through a temp register. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
2357
/* Integer negate: for each enabled channel emit dst = op(0, src),
 * i.e. 0 - src with the subtract-style opcode supplied by the
 * instruction table (ctx->inst_info->op). */
static int tgsi_ineg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	for (i = 0; i < lasti + 1; i++) {

		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		/* first operand is the constant zero */
		alu.src[0].sel = V_SQ_ALU_SRC_0;

		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;

}
2388
/* CAYMAN: emit a scalar float op across the vector slots.  On Cayman the
 * former t-slot-only ops execute in all vector slots, so one instruction
 * is emitted per slot (3, or 4 when .w is written); every slot reads
 * channel 0 of the sources and the write mask gates which slot's result
 * is actually stored. */
static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, r;
	struct r600_bytecode_alu alu;
	/* 4 slots only when the .w channel is written, else 3 */
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);

			/* RSQ should take the absolute value of src */
			if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_RSQ) {
				r600_bytecode_src_set_abs(&alu.src[j]);
			}
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2418
/* CAYMAN: emit an integer multiply per destination channel.  For each
 * enabled channel k a full 4-slot ALU group is emitted reading channel k
 * of the sources, but only slot i == k stores its result — the op needs
 * all slots occupied even though one result is kept. */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	/* 4 channels to process only when .w is written, else 3 */
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
	for (k = 0; k < last_slot; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			/* only the slot matching the channel stores */
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2446
2447 /*
2448 * r600 - trunc to -PI..PI range
2449 * r700 - normalize by dividing by 2PI
2450 * see fdo bug 27901
2451 */
2452 static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
2453 {
2454 static float half_inv_pi = 1.0 /(3.1415926535 * 2);
2455 static float double_pi = 3.1415926535 * 2;
2456 static float neg_pi = -3.1415926535;
2457
2458 int r;
2459 struct r600_bytecode_alu alu;
2460
2461 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2462 alu.op = ALU_OP3_MULADD;
2463 alu.is_op3 = 1;
2464
2465 alu.dst.chan = 0;
2466 alu.dst.sel = ctx->temp_reg;
2467 alu.dst.write = 1;
2468
2469 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2470
2471 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2472 alu.src[1].chan = 0;
2473 alu.src[1].value = *(uint32_t *)&half_inv_pi;
2474 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2475 alu.src[2].chan = 0;
2476 alu.last = 1;
2477 r = r600_bytecode_add_alu(ctx->bc, &alu);
2478 if (r)
2479 return r;
2480
2481 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2482 alu.op = ALU_OP1_FRACT;
2483
2484 alu.dst.chan = 0;
2485 alu.dst.sel = ctx->temp_reg;
2486 alu.dst.write = 1;
2487
2488 alu.src[0].sel = ctx->temp_reg;
2489 alu.src[0].chan = 0;
2490 alu.last = 1;
2491 r = r600_bytecode_add_alu(ctx->bc, &alu);
2492 if (r)
2493 return r;
2494
2495 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2496 alu.op = ALU_OP3_MULADD;
2497 alu.is_op3 = 1;
2498
2499 alu.dst.chan = 0;
2500 alu.dst.sel = ctx->temp_reg;
2501 alu.dst.write = 1;
2502
2503 alu.src[0].sel = ctx->temp_reg;
2504 alu.src[0].chan = 0;
2505
2506 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2507 alu.src[1].chan = 0;
2508 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
2509 alu.src[2].chan = 0;
2510
2511 if (ctx->bc->chip_class == R600) {
2512 alu.src[1].value = *(uint32_t *)&double_pi;
2513 alu.src[2].value = *(uint32_t *)&neg_pi;
2514 } else {
2515 alu.src[1].sel = V_SQ_ALU_SRC_1;
2516 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2517 alu.src[2].neg = 1;
2518 }
2519
2520 alu.last = 1;
2521 r = r600_bytecode_add_alu(ctx->bc, &alu);
2522 if (r)
2523 return r;
2524 return 0;
2525 }
2526
/* CAYMAN: SIN/COS.  Scale the angle into hardware range with
 * tgsi_setup_trig(), then run the trig opcode in each vector slot;
 * every slot reads the scaled scalar in temp_reg.x and the write mask
 * gates which slot's result is stored. */
static int cayman_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	/* 4 slots only when the .w channel is written, else 3 */
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
	int i, r;

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;


	for (i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		/* scaled angle left in temp_reg.x by tgsi_setup_trig() */
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2557
/* SIN/COS for non-Cayman chips.  Scale the angle with tgsi_setup_trig(),
 * run the scalar trig opcode into temp_reg.x, then replicate that result
 * to every destination channel enabled in the write mask. */
static int tgsi_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	/* temp.x = trig(temp.x) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* replicate result */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		/* src chan 0 (temp.x) — left zeroed by the memset */
		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2600
/* SCS: dst.x = cos(src.x), dst.y = sin(src.x), dst.z = 0.0, dst.w = 1.0.
 * Each channel is emitted only when enabled in the write mask; on CAYMAN
 * the COS/SIN ops occupy three vector slots with only the target slot
 * storing its result. */
static int tgsi_scs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	/* We'll only need the trig stuff if we are going to write to the
	 * X or Y components of the destination vector.
	 */
	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
		r = tgsi_setup_trig(ctx);
		if (r)
			return r;
	}

	/* dst.x = COS */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_COS;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				/* only slot 0 (the .x channel) stores */
				if (i == 0)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_COS;
			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.y = SIN */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_SIN;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				/* only slot 1 (the .y channel) stores */
				if (i == 1)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_SIN;
			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.z = 0.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.w = 1.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
2721
/* KILL / KILL_IF: emit a 4-channel kill-compare ALU group using the
 * opcode from the instruction table.  For unconditional KILL the compare
 * operands are the constants 0 and -1 (so the condition always fires);
 * for KILL_IF the source register is compared against 0 per channel. */
static int tgsi_kill(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILL) {
			/* unconditional: compare 0 against -1.0 */
			alu.src[1].sel = V_SQ_ALU_SRC_1;
			alu.src[1].neg = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* kill must be last in ALU */
	ctx->bc->force_add_cf = 1;
	ctx->shader->uses_kill = TRUE;
	return 0;
}
2754
/* LIT (lighting coefficients):
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0)
 *   dst.z = exp(MUL_LIT(log_clamped(max(src.y, 0)), src.w, src.x))
 *           (the specular term; only computed when .z is written)
 *   dst.w = 1.0
 * On CAYMAN the scalar LOG/EXP ops are spread over three vector slots
 * with only one slot storing. */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		int i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the log result landed (differs per chip path) */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
2904
/* RSQ: compute 1/sqrt(|src.x|) into temp_reg.x and replicate the scalar
 * result to every written destination channel.  The clamped variant and
 * the forced abs() match GL's RSQ semantics. */
static int tgsi_rsq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	/* XXX:
	 * For state trackers other than OpenGL, we'll want to use
	 * _RECIPSQRT_IEEE instead.
	 */
	alu.op = ALU_OP1_RECIPSQRT_CLAMPED;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
		r600_bytecode_src_set_abs(&alu.src[i]);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}
2932
2933 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
2934 {
2935 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2936 struct r600_bytecode_alu alu;
2937 int i, r;
2938
2939 for (i = 0; i < 4; i++) {
2940 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2941 alu.src[0].sel = ctx->temp_reg;
2942 alu.op = ALU_OP1_MOV;
2943 alu.dst.chan = i;
2944 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2945 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2946 if (i == 3)
2947 alu.last = 1;
2948 r = r600_bytecode_add_alu(ctx->bc, &alu);
2949 if (r)
2950 return r;
2951 }
2952 return 0;
2953 }
2954
2955 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
2956 {
2957 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2958 struct r600_bytecode_alu alu;
2959 int i, r;
2960
2961 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2962 alu.op = ctx->inst_info->op;
2963 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
2964 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
2965 }
2966 alu.dst.sel = ctx->temp_reg;
2967 alu.dst.write = 1;
2968 alu.last = 1;
2969 r = r600_bytecode_add_alu(ctx->bc, &alu);
2970 if (r)
2971 return r;
2972 /* replicate result */
2973 return tgsi_helper_tempx_replicate(ctx);
2974 }
2975
/* CAYMAN: POW(a,b) = EXP2(b * LOG2(a)).  LOG2(a.x) is computed into
 * three temp channels (the scalar op spans three vector slots), the
 * multiply by b.x goes to temp.x, and the final EXP2 runs per vector
 * slot with the write mask gating the stores. */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	/* 4 slots only when the .w channel is written, else 3 */
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	/* temp.xyz = LOG2(a.x) */
	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a))*/
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3025
/* POW(a,b) = EXP2(b * LOG2(a)): three scalar trans-unit ops chained
 * through temp_reg.x, then the result is replicated to every written
 * destination channel. */
static int tgsi_pow(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	/* LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_LOG_IEEE;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* POW(a,b) = EXP2(b * LOG2(a))*/
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_EXP_IEEE;
	alu.src[0].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return tgsi_helper_tempx_replicate(ctx);
}
3064
3065 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
3066 {
3067 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3068 struct r600_bytecode_alu alu;
3069 int i, r, j;
3070 unsigned write_mask = inst->Dst[0].Register.WriteMask;
3071 int tmp0 = ctx->temp_reg;
3072 int tmp1 = r600_get_temp(ctx);
3073 int tmp2 = r600_get_temp(ctx);
3074 int tmp3 = r600_get_temp(ctx);
3075 /* Unsigned path:
3076 *
3077 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
3078 *
3079 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
3080 * 2. tmp0.z = lo (tmp0.x * src2)
3081 * 3. tmp0.w = -tmp0.z
3082 * 4. tmp0.y = hi (tmp0.x * src2)
3083 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
3084 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
3085 * 7. tmp1.x = tmp0.x - tmp0.w
3086 * 8. tmp1.y = tmp0.x + tmp0.w
3087 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
3088 * 10. tmp0.z = hi(tmp0.x * src1) = q
3089 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
3090 *
3091 * 12. tmp0.w = src1 - tmp0.y = r
3092 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
3093 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
3094 *
3095 * if DIV
3096 *
3097 * 15. tmp1.z = tmp0.z + 1 = q + 1
3098 * 16. tmp1.w = tmp0.z - 1 = q - 1
3099 *
3100 * else MOD
3101 *
3102 * 15. tmp1.z = tmp0.w - src2 = r - src2
3103 * 16. tmp1.w = tmp0.w + src2 = r + src2
3104 *
3105 * endif
3106 *
3107 * 17. tmp1.x = tmp1.x & tmp1.y
3108 *
3109 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
3110 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
3111 *
3112 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
3113 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
3114 *
3115 * Signed path:
3116 *
3117 * Same as unsigned, using abs values of the operands,
3118 * and fixing the sign of the result in the end.
3119 */
3120
3121 for (i = 0; i < 4; i++) {
3122 if (!(write_mask & (1<<i)))
3123 continue;
3124
3125 if (signed_op) {
3126
3127 /* tmp2.x = -src0 */
3128 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3129 alu.op = ALU_OP2_SUB_INT;
3130
3131 alu.dst.sel = tmp2;
3132 alu.dst.chan = 0;
3133 alu.dst.write = 1;
3134
3135 alu.src[0].sel = V_SQ_ALU_SRC_0;
3136
3137 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3138
3139 alu.last = 1;
3140 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3141 return r;
3142
3143 /* tmp2.y = -src1 */
3144 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3145 alu.op = ALU_OP2_SUB_INT;
3146
3147 alu.dst.sel = tmp2;
3148 alu.dst.chan = 1;
3149 alu.dst.write = 1;
3150
3151 alu.src[0].sel = V_SQ_ALU_SRC_0;
3152
3153 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3154
3155 alu.last = 1;
3156 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3157 return r;
3158
3159 /* tmp2.z sign bit is set if src0 and src2 signs are different */
3160 /* it will be a sign of the quotient */
3161 if (!mod) {
3162
3163 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3164 alu.op = ALU_OP2_XOR_INT;
3165
3166 alu.dst.sel = tmp2;
3167 alu.dst.chan = 2;
3168 alu.dst.write = 1;
3169
3170 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3171 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3172
3173 alu.last = 1;
3174 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3175 return r;
3176 }
3177
3178 /* tmp2.x = |src0| */
3179 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3180 alu.op = ALU_OP3_CNDGE_INT;
3181 alu.is_op3 = 1;
3182
3183 alu.dst.sel = tmp2;
3184 alu.dst.chan = 0;
3185 alu.dst.write = 1;
3186
3187 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3188 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3189 alu.src[2].sel = tmp2;
3190 alu.src[2].chan = 0;
3191
3192 alu.last = 1;
3193 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3194 return r;
3195
3196 /* tmp2.y = |src1| */
3197 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3198 alu.op = ALU_OP3_CNDGE_INT;
3199 alu.is_op3 = 1;
3200
3201 alu.dst.sel = tmp2;
3202 alu.dst.chan = 1;
3203 alu.dst.write = 1;
3204
3205 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3206 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3207 alu.src[2].sel = tmp2;
3208 alu.src[2].chan = 1;
3209
3210 alu.last = 1;
3211 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3212 return r;
3213
3214 }
3215
3216 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
3217 if (ctx->bc->chip_class == CAYMAN) {
3218 /* tmp3.x = u2f(src2) */
3219 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3220 alu.op = ALU_OP1_UINT_TO_FLT;
3221
3222 alu.dst.sel = tmp3;
3223 alu.dst.chan = 0;
3224 alu.dst.write = 1;
3225
3226 if (signed_op) {
3227 alu.src[0].sel = tmp2;
3228 alu.src[0].chan = 1;
3229 } else {
3230 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3231 }
3232
3233 alu.last = 1;
3234 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3235 return r;
3236
3237 /* tmp0.x = recip(tmp3.x) */
3238 for (j = 0 ; j < 3; j++) {
3239 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3240 alu.op = ALU_OP1_RECIP_IEEE;
3241
3242 alu.dst.sel = tmp0;
3243 alu.dst.chan = j;
3244 alu.dst.write = (j == 0);
3245
3246 alu.src[0].sel = tmp3;
3247 alu.src[0].chan = 0;
3248
3249 if (j == 2)
3250 alu.last = 1;
3251 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3252 return r;
3253 }
3254
3255 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3256 alu.op = ALU_OP2_MUL;
3257
3258 alu.src[0].sel = tmp0;
3259 alu.src[0].chan = 0;
3260
3261 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3262 alu.src[1].value = 0x4f800000;
3263
3264 alu.dst.sel = tmp3;
3265 alu.dst.write = 1;
3266 alu.last = 1;
3267 r = r600_bytecode_add_alu(ctx->bc, &alu);
3268 if (r)
3269 return r;
3270
3271 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3272 alu.op = ALU_OP1_FLT_TO_UINT;
3273
3274 alu.dst.sel = tmp0;
3275 alu.dst.chan = 0;
3276 alu.dst.write = 1;
3277
3278 alu.src[0].sel = tmp3;
3279 alu.src[0].chan = 0;
3280
3281 alu.last = 1;
3282 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3283 return r;
3284
3285 } else {
3286 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3287 alu.op = ALU_OP1_RECIP_UINT;
3288
3289 alu.dst.sel = tmp0;
3290 alu.dst.chan = 0;
3291 alu.dst.write = 1;
3292
3293 if (signed_op) {
3294 alu.src[0].sel = tmp2;
3295 alu.src[0].chan = 1;
3296 } else {
3297 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3298 }
3299
3300 alu.last = 1;
3301 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3302 return r;
3303 }
3304
3305 /* 2. tmp0.z = lo (tmp0.x * src2) */
3306 if (ctx->bc->chip_class == CAYMAN) {
3307 for (j = 0 ; j < 4; j++) {
3308 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3309 alu.op = ALU_OP2_MULLO_UINT;
3310
3311 alu.dst.sel = tmp0;
3312 alu.dst.chan = j;
3313 alu.dst.write = (j == 2);
3314
3315 alu.src[0].sel = tmp0;
3316 alu.src[0].chan = 0;
3317 if (signed_op) {
3318 alu.src[1].sel = tmp2;
3319 alu.src[1].chan = 1;
3320 } else {
3321 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3322 }
3323
3324 alu.last = (j == 3);
3325 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3326 return r;
3327 }
3328 } else {
3329 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3330 alu.op = ALU_OP2_MULLO_UINT;
3331
3332 alu.dst.sel = tmp0;
3333 alu.dst.chan = 2;
3334 alu.dst.write = 1;
3335
3336 alu.src[0].sel = tmp0;
3337 alu.src[0].chan = 0;
3338 if (signed_op) {
3339 alu.src[1].sel = tmp2;
3340 alu.src[1].chan = 1;
3341 } else {
3342 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3343 }
3344
3345 alu.last = 1;
3346 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3347 return r;
3348 }
3349
3350 /* 3. tmp0.w = -tmp0.z */
3351 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3352 alu.op = ALU_OP2_SUB_INT;
3353
3354 alu.dst.sel = tmp0;
3355 alu.dst.chan = 3;
3356 alu.dst.write = 1;
3357
3358 alu.src[0].sel = V_SQ_ALU_SRC_0;
3359 alu.src[1].sel = tmp0;
3360 alu.src[1].chan = 2;
3361
3362 alu.last = 1;
3363 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3364 return r;
3365
3366 /* 4. tmp0.y = hi (tmp0.x * src2) */
3367 if (ctx->bc->chip_class == CAYMAN) {
3368 for (j = 0 ; j < 4; j++) {
3369 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3370 alu.op = ALU_OP2_MULHI_UINT;
3371
3372 alu.dst.sel = tmp0;
3373 alu.dst.chan = j;
3374 alu.dst.write = (j == 1);
3375
3376 alu.src[0].sel = tmp0;
3377 alu.src[0].chan = 0;
3378
3379 if (signed_op) {
3380 alu.src[1].sel = tmp2;
3381 alu.src[1].chan = 1;
3382 } else {
3383 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3384 }
3385 alu.last = (j == 3);
3386 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3387 return r;
3388 }
3389 } else {
3390 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3391 alu.op = ALU_OP2_MULHI_UINT;
3392
3393 alu.dst.sel = tmp0;
3394 alu.dst.chan = 1;
3395 alu.dst.write = 1;
3396
3397 alu.src[0].sel = tmp0;
3398 alu.src[0].chan = 0;
3399
3400 if (signed_op) {
3401 alu.src[1].sel = tmp2;
3402 alu.src[1].chan = 1;
3403 } else {
3404 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3405 }
3406
3407 alu.last = 1;
3408 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3409 return r;
3410 }
3411
3412 /* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src)) */
3413 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3414 alu.op = ALU_OP3_CNDE_INT;
3415 alu.is_op3 = 1;
3416
3417 alu.dst.sel = tmp0;
3418 alu.dst.chan = 2;
3419 alu.dst.write = 1;
3420
3421 alu.src[0].sel = tmp0;
3422 alu.src[0].chan = 1;
3423 alu.src[1].sel = tmp0;
3424 alu.src[1].chan = 3;
3425 alu.src[2].sel = tmp0;
3426 alu.src[2].chan = 2;
3427
3428 alu.last = 1;
3429 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3430 return r;
3431
3432 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
3433 if (ctx->bc->chip_class == CAYMAN) {
3434 for (j = 0 ; j < 4; j++) {
3435 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3436 alu.op = ALU_OP2_MULHI_UINT;
3437
3438 alu.dst.sel = tmp0;
3439 alu.dst.chan = j;
3440 alu.dst.write = (j == 3);
3441
3442 alu.src[0].sel = tmp0;
3443 alu.src[0].chan = 2;
3444
3445 alu.src[1].sel = tmp0;
3446 alu.src[1].chan = 0;
3447
3448 alu.last = (j == 3);
3449 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3450 return r;
3451 }
3452 } else {
3453 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3454 alu.op = ALU_OP2_MULHI_UINT;
3455
3456 alu.dst.sel = tmp0;
3457 alu.dst.chan = 3;
3458 alu.dst.write = 1;
3459
3460 alu.src[0].sel = tmp0;
3461 alu.src[0].chan = 2;
3462
3463 alu.src[1].sel = tmp0;
3464 alu.src[1].chan = 0;
3465
3466 alu.last = 1;
3467 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3468 return r;
3469 }
3470
3471 /* 7. tmp1.x = tmp0.x - tmp0.w */
3472 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3473 alu.op = ALU_OP2_SUB_INT;
3474
3475 alu.dst.sel = tmp1;
3476 alu.dst.chan = 0;
3477 alu.dst.write = 1;
3478
3479 alu.src[0].sel = tmp0;
3480 alu.src[0].chan = 0;
3481 alu.src[1].sel = tmp0;
3482 alu.src[1].chan = 3;
3483
3484 alu.last = 1;
3485 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3486 return r;
3487
3488 /* 8. tmp1.y = tmp0.x + tmp0.w */
3489 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3490 alu.op = ALU_OP2_ADD_INT;
3491
3492 alu.dst.sel = tmp1;
3493 alu.dst.chan = 1;
3494 alu.dst.write = 1;
3495
3496 alu.src[0].sel = tmp0;
3497 alu.src[0].chan = 0;
3498 alu.src[1].sel = tmp0;
3499 alu.src[1].chan = 3;
3500
3501 alu.last = 1;
3502 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3503 return r;
3504
3505 /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
3506 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3507 alu.op = ALU_OP3_CNDE_INT;
3508 alu.is_op3 = 1;
3509
3510 alu.dst.sel = tmp0;
3511 alu.dst.chan = 0;
3512 alu.dst.write = 1;
3513
3514 alu.src[0].sel = tmp0;
3515 alu.src[0].chan = 1;
3516 alu.src[1].sel = tmp1;
3517 alu.src[1].chan = 1;
3518 alu.src[2].sel = tmp1;
3519 alu.src[2].chan = 0;
3520
3521 alu.last = 1;
3522 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3523 return r;
3524
3525 /* 10. tmp0.z = hi(tmp0.x * src1) = q */
3526 if (ctx->bc->chip_class == CAYMAN) {
3527 for (j = 0 ; j < 4; j++) {
3528 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3529 alu.op = ALU_OP2_MULHI_UINT;
3530
3531 alu.dst.sel = tmp0;
3532 alu.dst.chan = j;
3533 alu.dst.write = (j == 2);
3534
3535 alu.src[0].sel = tmp0;
3536 alu.src[0].chan = 0;
3537
3538 if (signed_op) {
3539 alu.src[1].sel = tmp2;
3540 alu.src[1].chan = 0;
3541 } else {
3542 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3543 }
3544
3545 alu.last = (j == 3);
3546 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3547 return r;
3548 }
3549 } else {
3550 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3551 alu.op = ALU_OP2_MULHI_UINT;
3552
3553 alu.dst.sel = tmp0;
3554 alu.dst.chan = 2;
3555 alu.dst.write = 1;
3556
3557 alu.src[0].sel = tmp0;
3558 alu.src[0].chan = 0;
3559
3560 if (signed_op) {
3561 alu.src[1].sel = tmp2;
3562 alu.src[1].chan = 0;
3563 } else {
3564 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3565 }
3566
3567 alu.last = 1;
3568 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3569 return r;
3570 }
3571
3572 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
3573 if (ctx->bc->chip_class == CAYMAN) {
3574 for (j = 0 ; j < 4; j++) {
3575 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3576 alu.op = ALU_OP2_MULLO_UINT;
3577
3578 alu.dst.sel = tmp0;
3579 alu.dst.chan = j;
3580 alu.dst.write = (j == 1);
3581
3582 if (signed_op) {
3583 alu.src[0].sel = tmp2;
3584 alu.src[0].chan = 1;
3585 } else {
3586 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3587 }
3588
3589 alu.src[1].sel = tmp0;
3590 alu.src[1].chan = 2;
3591
3592 alu.last = (j == 3);
3593 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3594 return r;
3595 }
3596 } else {
3597 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3598 alu.op = ALU_OP2_MULLO_UINT;
3599
3600 alu.dst.sel = tmp0;
3601 alu.dst.chan = 1;
3602 alu.dst.write = 1;
3603
3604 if (signed_op) {
3605 alu.src[0].sel = tmp2;
3606 alu.src[0].chan = 1;
3607 } else {
3608 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3609 }
3610
3611 alu.src[1].sel = tmp0;
3612 alu.src[1].chan = 2;
3613
3614 alu.last = 1;
3615 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3616 return r;
3617 }
3618
3619 /* 12. tmp0.w = src1 - tmp0.y = r */
3620 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3621 alu.op = ALU_OP2_SUB_INT;
3622
3623 alu.dst.sel = tmp0;
3624 alu.dst.chan = 3;
3625 alu.dst.write = 1;
3626
3627 if (signed_op) {
3628 alu.src[0].sel = tmp2;
3629 alu.src[0].chan = 0;
3630 } else {
3631 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3632 }
3633
3634 alu.src[1].sel = tmp0;
3635 alu.src[1].chan = 1;
3636
3637 alu.last = 1;
3638 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3639 return r;
3640
3641 /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
3642 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3643 alu.op = ALU_OP2_SETGE_UINT;
3644
3645 alu.dst.sel = tmp1;
3646 alu.dst.chan = 0;
3647 alu.dst.write = 1;
3648
3649 alu.src[0].sel = tmp0;
3650 alu.src[0].chan = 3;
3651 if (signed_op) {
3652 alu.src[1].sel = tmp2;
3653 alu.src[1].chan = 1;
3654 } else {
3655 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3656 }
3657
3658 alu.last = 1;
3659 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3660 return r;
3661
3662 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
3663 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3664 alu.op = ALU_OP2_SETGE_UINT;
3665
3666 alu.dst.sel = tmp1;
3667 alu.dst.chan = 1;
3668 alu.dst.write = 1;
3669
3670 if (signed_op) {
3671 alu.src[0].sel = tmp2;
3672 alu.src[0].chan = 0;
3673 } else {
3674 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3675 }
3676
3677 alu.src[1].sel = tmp0;
3678 alu.src[1].chan = 1;
3679
3680 alu.last = 1;
3681 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3682 return r;
3683
3684 if (mod) { /* UMOD */
3685
3686 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */
3687 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3688 alu.op = ALU_OP2_SUB_INT;
3689
3690 alu.dst.sel = tmp1;
3691 alu.dst.chan = 2;
3692 alu.dst.write = 1;
3693
3694 alu.src[0].sel = tmp0;
3695 alu.src[0].chan = 3;
3696
3697 if (signed_op) {
3698 alu.src[1].sel = tmp2;
3699 alu.src[1].chan = 1;
3700 } else {
3701 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3702 }
3703
3704 alu.last = 1;
3705 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3706 return r;
3707
3708 /* 16. tmp1.w = tmp0.w + src2 = r + src2 */
3709 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3710 alu.op = ALU_OP2_ADD_INT;
3711
3712 alu.dst.sel = tmp1;
3713 alu.dst.chan = 3;
3714 alu.dst.write = 1;
3715
3716 alu.src[0].sel = tmp0;
3717 alu.src[0].chan = 3;
3718 if (signed_op) {
3719 alu.src[1].sel = tmp2;
3720 alu.src[1].chan = 1;
3721 } else {
3722 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3723 }
3724
3725 alu.last = 1;
3726 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3727 return r;
3728
3729 } else { /* UDIV */
3730
3731 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
3732 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3733 alu.op = ALU_OP2_ADD_INT;
3734
3735 alu.dst.sel = tmp1;
3736 alu.dst.chan = 2;
3737 alu.dst.write = 1;
3738
3739 alu.src[0].sel = tmp0;
3740 alu.src[0].chan = 2;
3741 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
3742
3743 alu.last = 1;
3744 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3745 return r;
3746
3747 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */
3748 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3749 alu.op = ALU_OP2_ADD_INT;
3750
3751 alu.dst.sel = tmp1;
3752 alu.dst.chan = 3;
3753 alu.dst.write = 1;
3754
3755 alu.src[0].sel = tmp0;
3756 alu.src[0].chan = 2;
3757 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
3758
3759 alu.last = 1;
3760 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3761 return r;
3762
3763 }
3764
3765 /* 17. tmp1.x = tmp1.x & tmp1.y */
3766 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3767 alu.op = ALU_OP2_AND_INT;
3768
3769 alu.dst.sel = tmp1;
3770 alu.dst.chan = 0;
3771 alu.dst.write = 1;
3772
3773 alu.src[0].sel = tmp1;
3774 alu.src[0].chan = 0;
3775 alu.src[1].sel = tmp1;
3776 alu.src[1].chan = 1;
3777
3778 alu.last = 1;
3779 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3780 return r;
3781
3782 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
3783 /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
3784 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3785 alu.op = ALU_OP3_CNDE_INT;
3786 alu.is_op3 = 1;
3787
3788 alu.dst.sel = tmp0;
3789 alu.dst.chan = 2;
3790 alu.dst.write = 1;
3791
3792 alu.src[0].sel = tmp1;
3793 alu.src[0].chan = 0;
3794 alu.src[1].sel = tmp0;
3795 alu.src[1].chan = mod ? 3 : 2;
3796 alu.src[2].sel = tmp1;
3797 alu.src[2].chan = 2;
3798
3799 alu.last = 1;
3800 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3801 return r;
3802
3803 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
3804 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3805 alu.op = ALU_OP3_CNDE_INT;
3806 alu.is_op3 = 1;
3807
3808 if (signed_op) {
3809 alu.dst.sel = tmp0;
3810 alu.dst.chan = 2;
3811 alu.dst.write = 1;
3812 } else {
3813 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3814 }
3815
3816 alu.src[0].sel = tmp1;
3817 alu.src[0].chan = 1;
3818 alu.src[1].sel = tmp1;
3819 alu.src[1].chan = 3;
3820 alu.src[2].sel = tmp0;
3821 alu.src[2].chan = 2;
3822
3823 alu.last = 1;
3824 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3825 return r;
3826
3827 if (signed_op) {
3828
3829 /* fix the sign of the result */
3830
3831 if (mod) {
3832
3833 /* tmp0.x = -tmp0.z */
3834 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3835 alu.op = ALU_OP2_SUB_INT;
3836
3837 alu.dst.sel = tmp0;
3838 alu.dst.chan = 0;
3839 alu.dst.write = 1;
3840
3841 alu.src[0].sel = V_SQ_ALU_SRC_0;
3842 alu.src[1].sel = tmp0;
3843 alu.src[1].chan = 2;
3844
3845 alu.last = 1;
3846 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3847 return r;
3848
3849 /* sign of the remainder is the same as the sign of src0 */
3850 /* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
3851 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3852 alu.op = ALU_OP3_CNDGE_INT;
3853 alu.is_op3 = 1;
3854
3855 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3856
3857 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3858 alu.src[1].sel = tmp0;
3859 alu.src[1].chan = 2;
3860 alu.src[2].sel = tmp0;
3861 alu.src[2].chan = 0;
3862
3863 alu.last = 1;
3864 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3865 return r;
3866
3867 } else {
3868
3869 /* tmp0.x = -tmp0.z */
3870 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3871 alu.op = ALU_OP2_SUB_INT;
3872
3873 alu.dst.sel = tmp0;
3874 alu.dst.chan = 0;
3875 alu.dst.write = 1;
3876
3877 alu.src[0].sel = V_SQ_ALU_SRC_0;
3878 alu.src[1].sel = tmp0;
3879 alu.src[1].chan = 2;
3880
3881 alu.last = 1;
3882 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3883 return r;
3884
3885 /* fix the quotient sign (same as the sign of src0*src1) */
3886 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
3887 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3888 alu.op = ALU_OP3_CNDGE_INT;
3889 alu.is_op3 = 1;
3890
3891 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3892
3893 alu.src[0].sel = tmp2;
3894 alu.src[0].chan = 2;
3895 alu.src[1].sel = tmp0;
3896 alu.src[1].chan = 2;
3897 alu.src[2].sel = tmp0;
3898 alu.src[2].chan = 0;
3899
3900 alu.last = 1;
3901 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3902 return r;
3903 }
3904 }
3905 }
3906 return 0;
3907 }
3908
/* TGSI_OPCODE_UDIV: unsigned integer division, via the shared div/mod emitter. */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 0);
}
3913
/* TGSI_OPCODE_UMOD: unsigned integer remainder, via the shared div/mod emitter. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 0);
}
3918
/* TGSI_OPCODE_IDIV: signed integer division, via the shared div/mod emitter. */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 1);
}
3923
/* TGSI_OPCODE_IMOD: signed integer remainder, via the shared div/mod emitter. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 1);
}
3928
3929
/* Float-to-integer conversion (F2I/F2U family).
 * Pass 1 truncates every written channel toward zero into ctx->temp_reg;
 * pass 2 applies the instruction-specific conversion op from
 * ctx->inst_info->op into the real destination. */
static int tgsi_f2i(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* Pass 1: tmp.chan = trunc(src.chan) for each channel in the write mask. */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_TRUNC;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Pass 2: dst.chan = convert(tmp.chan). */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		/* NOTE(review): FLT_TO_UINT forces 'last' on every instance,
		 * presumably because it must terminate its own instruction
		 * group on the affected hardware — confirm against the ISA. */
		if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3978
3979 static int tgsi_iabs(struct r600_shader_ctx *ctx)
3980 {
3981 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3982 struct r600_bytecode_alu alu;
3983 int i, r;
3984 unsigned write_mask = inst->Dst[0].Register.WriteMask;
3985 int last_inst = tgsi_last_instruction(write_mask);
3986
3987 /* tmp = -src */
3988 for (i = 0; i < 4; i++) {
3989 if (!(write_mask & (1<<i)))
3990 continue;
3991
3992 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3993 alu.op = ALU_OP2_SUB_INT;
3994
3995 alu.dst.sel = ctx->temp_reg;
3996 alu.dst.chan = i;
3997 alu.dst.write = 1;
3998
3999 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4000 alu.src[0].sel = V_SQ_ALU_SRC_0;
4001
4002 if (i == last_inst)
4003 alu.last = 1;
4004 r = r600_bytecode_add_alu(ctx->bc, &alu);
4005 if (r)
4006 return r;
4007 }
4008
4009 /* dst = (src >= 0 ? src : tmp) */
4010 for (i = 0; i < 4; i++) {
4011 if (!(write_mask & (1<<i)))
4012 continue;
4013
4014 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4015 alu.op = ALU_OP3_CNDGE_INT;
4016 alu.is_op3 = 1;
4017 alu.dst.write = 1;
4018
4019 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4020
4021 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4022 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4023 alu.src[2].sel = ctx->temp_reg;
4024 alu.src[2].chan = i;
4025
4026 if (i == last_inst)
4027 alu.last = 1;
4028 r = r600_bytecode_add_alu(ctx->bc, &alu);
4029 if (r)
4030 return r;
4031 }
4032 return 0;
4033 }
4034
4035 static int tgsi_issg(struct r600_shader_ctx *ctx)
4036 {
4037 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4038 struct r600_bytecode_alu alu;
4039 int i, r;
4040 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4041 int last_inst = tgsi_last_instruction(write_mask);
4042
4043 /* tmp = (src >= 0 ? src : -1) */
4044 for (i = 0; i < 4; i++) {
4045 if (!(write_mask & (1<<i)))
4046 continue;
4047
4048 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4049 alu.op = ALU_OP3_CNDGE_INT;
4050 alu.is_op3 = 1;
4051
4052 alu.dst.sel = ctx->temp_reg;
4053 alu.dst.chan = i;
4054 alu.dst.write = 1;
4055
4056 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4057 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4058 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
4059
4060 if (i == last_inst)
4061 alu.last = 1;
4062 r = r600_bytecode_add_alu(ctx->bc, &alu);
4063 if (r)
4064 return r;
4065 }
4066
4067 /* dst = (tmp > 0 ? 1 : tmp) */
4068 for (i = 0; i < 4; i++) {
4069 if (!(write_mask & (1<<i)))
4070 continue;
4071
4072 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4073 alu.op = ALU_OP3_CNDGT_INT;
4074 alu.is_op3 = 1;
4075 alu.dst.write = 1;
4076
4077 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4078
4079 alu.src[0].sel = ctx->temp_reg;
4080 alu.src[0].chan = i;
4081
4082 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
4083
4084 alu.src[2].sel = ctx->temp_reg;
4085 alu.src[2].chan = i;
4086
4087 if (i == last_inst)
4088 alu.last = 1;
4089 r = r600_bytecode_add_alu(ctx->bc, &alu);
4090 if (r)
4091 return r;
4092 }
4093 return 0;
4094 }
4095
4096
4097
4098 static int tgsi_ssg(struct r600_shader_ctx *ctx)
4099 {
4100 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4101 struct r600_bytecode_alu alu;
4102 int i, r;
4103
4104 /* tmp = (src > 0 ? 1 : src) */
4105 for (i = 0; i < 4; i++) {
4106 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4107 alu.op = ALU_OP3_CNDGT;
4108 alu.is_op3 = 1;
4109
4110 alu.dst.sel = ctx->temp_reg;
4111 alu.dst.chan = i;
4112
4113 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4114 alu.src[1].sel = V_SQ_ALU_SRC_1;
4115 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
4116
4117 if (i == 3)
4118 alu.last = 1;
4119 r = r600_bytecode_add_alu(ctx->bc, &alu);
4120 if (r)
4121 return r;
4122 }
4123
4124 /* dst = (-tmp > 0 ? -1 : tmp) */
4125 for (i = 0; i < 4; i++) {
4126 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4127 alu.op = ALU_OP3_CNDGT;
4128 alu.is_op3 = 1;
4129 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4130
4131 alu.src[0].sel = ctx->temp_reg;
4132 alu.src[0].chan = i;
4133 alu.src[0].neg = 1;
4134
4135 alu.src[1].sel = V_SQ_ALU_SRC_1;
4136 alu.src[1].neg = 1;
4137
4138 alu.src[2].sel = ctx->temp_reg;
4139 alu.src[2].chan = i;
4140
4141 if (i == 3)
4142 alu.last = 1;
4143 r = r600_bytecode_add_alu(ctx->bc, &alu);
4144 if (r)
4145 return r;
4146 }
4147 return 0;
4148 }
4149
4150 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
4151 {
4152 struct r600_bytecode_alu alu;
4153 int i, r;
4154
4155 for (i = 0; i < 4; i++) {
4156 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4157 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
4158 alu.op = ALU_OP0_NOP;
4159 alu.dst.chan = i;
4160 } else {
4161 alu.op = ALU_OP1_MOV;
4162 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4163 alu.src[0].sel = ctx->temp_reg;
4164 alu.src[0].chan = i;
4165 }
4166 if (i == 3) {
4167 alu.last = 1;
4168 }
4169 r = r600_bytecode_add_alu(ctx->bc, &alu);
4170 if (r)
4171 return r;
4172 }
4173 return 0;
4174 }
4175
4176 static int tgsi_op3(struct r600_shader_ctx *ctx)
4177 {
4178 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4179 struct r600_bytecode_alu alu;
4180 int i, j, r;
4181 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4182
4183 for (i = 0; i < lasti + 1; i++) {
4184 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4185 continue;
4186
4187 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4188 alu.op = ctx->inst_info->op;
4189 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4190 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
4191 }
4192
4193 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4194 alu.dst.chan = i;
4195 alu.dst.write = 1;
4196 alu.is_op3 = 1;
4197 if (i == lasti) {
4198 alu.last = 1;
4199 }
4200 r = r600_bytecode_add_alu(ctx->bc, &alu);
4201 if (r)
4202 return r;
4203 }
4204 return 0;
4205 }
4206
4207 static int tgsi_dp(struct r600_shader_ctx *ctx)
4208 {
4209 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4210 struct r600_bytecode_alu alu;
4211 int i, j, r;
4212
4213 for (i = 0; i < 4; i++) {
4214 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4215 alu.op = ctx->inst_info->op;
4216 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4217 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
4218 }
4219
4220 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4221 alu.dst.chan = i;
4222 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4223 /* handle some special cases */
4224 switch (ctx->inst_info->tgsi_opcode) {
4225 case TGSI_OPCODE_DP2:
4226 if (i > 1) {
4227 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
4228 alu.src[0].chan = alu.src[1].chan = 0;
4229 }
4230 break;
4231 case TGSI_OPCODE_DP3:
4232 if (i > 2) {
4233 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
4234 alu.src[0].chan = alu.src[1].chan = 0;
4235 }
4236 break;
4237 case TGSI_OPCODE_DPH:
4238 if (i == 3) {
4239 alu.src[0].sel = V_SQ_ALU_SRC_1;
4240 alu.src[0].chan = 0;
4241 alu.src[0].neg = 0;
4242 }
4243 break;
4244 default:
4245 break;
4246 }
4247 if (i == 3) {
4248 alu.last = 1;
4249 }
4250 r = r600_bytecode_add_alu(ctx->bc, &alu);
4251 if (r)
4252 return r;
4253 }
4254 return 0;
4255 }
4256
4257 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
4258 unsigned index)
4259 {
4260 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4261 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
4262 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
4263 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
4264 ctx->src[index].neg || ctx->src[index].abs;
4265 }
4266
4267 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
4268 unsigned index)
4269 {
4270 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4271 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
4272 }
4273
/* Emit a vertex-fetch (VFETCH) for a TXF on a buffer resource.
 * Optionally loads the source coordinates into ctx->temp_reg first,
 * then emits the fetch; on pre-Evergreen parts it additionally masks
 * the fetched components against the per-buffer format constants. */
static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
{
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_alu alu;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int src_gpr, r, i;
	/* Resource id comes from the sampler operand (src 1). */
	int id = tgsi_tex_get_src_gpr(ctx, 1);

	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
	/* Fetch sources must be plain GPRs: copy the coordinate to a temp
	 * when it lives elsewhere or carries modifiers. */
	if (src_requires_loading) {
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		src_gpr = ctx->temp_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
	vtx.src_gpr = src_gpr;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	/* Unwritten channels select 7 (masked). */
	vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;		/* SEL_X */
	vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;		/* SEL_Y */
	vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;		/* SEL_Z */
	vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;		/* SEL_W */
	vtx.use_const_fields = 1;
	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	/* Evergreen+ fetches honor use_const_fields fully; done. */
	if (ctx->bc->chip_class >= EVERGREEN)
		return 0;

	/* Pre-Evergreen: AND each fetched channel with the format mask
	 * stored in the buffer-info constant buffer (first dword pair). */
	for (i = 0; i < 4; i++) {
		int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;

		alu.dst.chan = i;
		alu.dst.sel = vtx.dst_gpr;
		alu.dst.write = 1;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = i;

		alu.src[1].sel = 512 + (id * 2);
		alu.src[1].chan = i % 4;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* OR the format's alpha bits into channel 3.
	 * NOTE(review): the guard tests mask bits 0-1 (x/y) while the OR
	 * writes channel w — looks deliberate for this constant layout but
	 * confirm against the buffer-info constant encoding. */
	if (inst->Dst[0].Register.WriteMask & 3) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_OR_INT;

		alu.dst.chan = 3;
		alu.dst.sel = vtx.dst_gpr;
		alu.dst.write = 1;

		alu.src[0].sel = vtx.dst_gpr;
		alu.src[0].chan = 3;

		alu.src[1].sel = 512 + (id * 2) + 1;
		alu.src[1].chan = 0;
		alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;

		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4368
4369 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
4370 {
4371 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4372 struct r600_bytecode_alu alu;
4373 int r;
4374 int id = tgsi_tex_get_src_gpr(ctx, 1);
4375
4376 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4377 alu.op = ALU_OP1_MOV;
4378
4379 if (ctx->bc->chip_class >= EVERGREEN) {
4380 alu.src[0].sel = 512 + (id / 4);
4381 alu.src[0].chan = id % 4;
4382 } else {
4383 /* r600 we have them at channel 2 of the second dword */
4384 alu.src[0].sel = 512 + (id * 2) + 1;
4385 alu.src[0].chan = 1;
4386 }
4387 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
4388 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
4389 alu.last = 1;
4390 r = r600_bytecode_add_alu(ctx->bc, &alu);
4391 if (r)
4392 return r;
4393 return 0;
4394 }
4395
4396 static int tgsi_tex(struct r600_shader_ctx *ctx)
4397 {
4398 static float one_point_five = 1.5f;
4399 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4400 struct r600_bytecode_tex tex;
4401 struct r600_bytecode_alu alu;
4402 unsigned src_gpr;
4403 int r, i, j;
4404 int opcode;
4405 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
4406 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
4407 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
4408 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
4409
4410 bool txf_add_offsets = inst->Texture.NumOffsets &&
4411 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
4412 inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
4413
4414 /* Texture fetch instructions can only use gprs as source.
4415 * Also they cannot negate the source or take the absolute value */
4416 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
4417 tgsi_tex_src_requires_loading(ctx, 0)) ||
4418 read_compressed_msaa || txf_add_offsets;
4419
4420 boolean src_loaded = FALSE;
4421 unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
4422 int8_t offset_x = 0, offset_y = 0, offset_z = 0;
4423 boolean has_txq_cube_array_z = false;
4424
4425 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
4426 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
4427 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
4428 if (inst->Dst[0].Register.WriteMask & 4) {
4429 ctx->shader->has_txq_cube_array_z_comp = true;
4430 has_txq_cube_array_z = true;
4431 }
4432
4433 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
4434 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
4435 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
4436 sampler_src_reg = 2;
4437
4438 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
4439
4440 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
4441 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
4442 ctx->shader->uses_tex_buffers = true;
4443 return r600_do_buffer_txq(ctx);
4444 }
4445 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
4446 if (ctx->bc->chip_class < EVERGREEN)
4447 ctx->shader->uses_tex_buffers = true;
4448 return do_vtx_fetch_inst(ctx, src_requires_loading);
4449 }
4450 }
4451
4452 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
4453 /* TGSI moves the sampler to src reg 3 for TXD */
4454 sampler_src_reg = 3;
4455
4456 for (i = 1; i < 3; i++) {
4457 /* set gradients h/v */
4458 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4459 tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
4460 FETCH_OP_SET_GRADIENTS_V;
4461 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4462 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4463
4464 if (tgsi_tex_src_requires_loading(ctx, i)) {
4465 tex.src_gpr = r600_get_temp(ctx);
4466 tex.src_sel_x = 0;
4467 tex.src_sel_y = 1;
4468 tex.src_sel_z = 2;
4469 tex.src_sel_w = 3;
4470
4471 for (j = 0; j < 4; j++) {
4472 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4473 alu.op = ALU_OP1_MOV;
4474 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
4475 alu.dst.sel = tex.src_gpr;
4476 alu.dst.chan = j;
4477 if (j == 3)
4478 alu.last = 1;
4479 alu.dst.write = 1;
4480 r = r600_bytecode_add_alu(ctx->bc, &alu);
4481 if (r)
4482 return r;
4483 }
4484
4485 } else {
4486 tex.src_gpr = tgsi_tex_get_src_gpr(ctx, i);
4487 tex.src_sel_x = ctx->src[i].swizzle[0];
4488 tex.src_sel_y = ctx->src[i].swizzle[1];
4489 tex.src_sel_z = ctx->src[i].swizzle[2];
4490 tex.src_sel_w = ctx->src[i].swizzle[3];
4491 tex.src_rel = ctx->src[i].rel;
4492 }
4493 tex.dst_gpr = ctx->temp_reg; /* just to avoid confusing the asm scheduler */
4494 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
4495 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
4496 tex.coord_type_x = 1;
4497 tex.coord_type_y = 1;
4498 tex.coord_type_z = 1;
4499 tex.coord_type_w = 1;
4500 }
4501 r = r600_bytecode_add_tex(ctx->bc, &tex);
4502 if (r)
4503 return r;
4504 }
4505 } else if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
4506 int out_chan;
4507 /* Add perspective divide */
4508 if (ctx->bc->chip_class == CAYMAN) {
4509 out_chan = 2;
4510 for (i = 0; i < 3; i++) {
4511 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4512 alu.op = ALU_OP1_RECIP_IEEE;
4513 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4514
4515 alu.dst.sel = ctx->temp_reg;
4516 alu.dst.chan = i;
4517 if (i == 2)
4518 alu.last = 1;
4519 if (out_chan == i)
4520 alu.dst.write = 1;
4521 r = r600_bytecode_add_alu(ctx->bc, &alu);
4522 if (r)
4523 return r;
4524 }
4525
4526 } else {
4527 out_chan = 3;
4528 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4529 alu.op = ALU_OP1_RECIP_IEEE;
4530 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4531
4532 alu.dst.sel = ctx->temp_reg;
4533 alu.dst.chan = out_chan;
4534 alu.last = 1;
4535 alu.dst.write = 1;
4536 r = r600_bytecode_add_alu(ctx->bc, &alu);
4537 if (r)
4538 return r;
4539 }
4540
4541 for (i = 0; i < 3; i++) {
4542 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4543 alu.op = ALU_OP2_MUL;
4544 alu.src[0].sel = ctx->temp_reg;
4545 alu.src[0].chan = out_chan;
4546 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4547 alu.dst.sel = ctx->temp_reg;
4548 alu.dst.chan = i;
4549 alu.dst.write = 1;
4550 r = r600_bytecode_add_alu(ctx->bc, &alu);
4551 if (r)
4552 return r;
4553 }
4554 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4555 alu.op = ALU_OP1_MOV;
4556 alu.src[0].sel = V_SQ_ALU_SRC_1;
4557 alu.src[0].chan = 0;
4558 alu.dst.sel = ctx->temp_reg;
4559 alu.dst.chan = 3;
4560 alu.last = 1;
4561 alu.dst.write = 1;
4562 r = r600_bytecode_add_alu(ctx->bc, &alu);
4563 if (r)
4564 return r;
4565 src_loaded = TRUE;
4566 src_gpr = ctx->temp_reg;
4567 }
4568
4569 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
4570 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
4571 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4572 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
4573 inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
4574 inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
4575
4576 static const unsigned src0_swizzle[] = {2, 2, 0, 1};
4577 static const unsigned src1_swizzle[] = {1, 0, 2, 2};
4578
4579 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
4580 for (i = 0; i < 4; i++) {
4581 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4582 alu.op = ALU_OP2_CUBE;
4583 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
4584 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
4585 alu.dst.sel = ctx->temp_reg;
4586 alu.dst.chan = i;
4587 if (i == 3)
4588 alu.last = 1;
4589 alu.dst.write = 1;
4590 r = r600_bytecode_add_alu(ctx->bc, &alu);
4591 if (r)
4592 return r;
4593 }
4594
4595 /* tmp1.z = RCP_e(|tmp1.z|) */
4596 if (ctx->bc->chip_class == CAYMAN) {
4597 for (i = 0; i < 3; i++) {
4598 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4599 alu.op = ALU_OP1_RECIP_IEEE;
4600 alu.src[0].sel = ctx->temp_reg;
4601 alu.src[0].chan = 2;
4602 alu.src[0].abs = 1;
4603 alu.dst.sel = ctx->temp_reg;
4604 alu.dst.chan = i;
4605 if (i == 2)
4606 alu.dst.write = 1;
4607 if (i == 2)
4608 alu.last = 1;
4609 r = r600_bytecode_add_alu(ctx->bc, &alu);
4610 if (r)
4611 return r;
4612 }
4613 } else {
4614 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4615 alu.op = ALU_OP1_RECIP_IEEE;
4616 alu.src[0].sel = ctx->temp_reg;
4617 alu.src[0].chan = 2;
4618 alu.src[0].abs = 1;
4619 alu.dst.sel = ctx->temp_reg;
4620 alu.dst.chan = 2;
4621 alu.dst.write = 1;
4622 alu.last = 1;
4623 r = r600_bytecode_add_alu(ctx->bc, &alu);
4624 if (r)
4625 return r;
4626 }
4627
4628 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
4629 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
4630 * muladd has no writemask, have to use another temp
4631 */
4632 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4633 alu.op = ALU_OP3_MULADD;
4634 alu.is_op3 = 1;
4635
4636 alu.src[0].sel = ctx->temp_reg;
4637 alu.src[0].chan = 0;
4638 alu.src[1].sel = ctx->temp_reg;
4639 alu.src[1].chan = 2;
4640
4641 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4642 alu.src[2].chan = 0;
4643 alu.src[2].value = *(uint32_t *)&one_point_five;
4644
4645 alu.dst.sel = ctx->temp_reg;
4646 alu.dst.chan = 0;
4647 alu.dst.write = 1;
4648
4649 r = r600_bytecode_add_alu(ctx->bc, &alu);
4650 if (r)
4651 return r;
4652
4653 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4654 alu.op = ALU_OP3_MULADD;
4655 alu.is_op3 = 1;
4656
4657 alu.src[0].sel = ctx->temp_reg;
4658 alu.src[0].chan = 1;
4659 alu.src[1].sel = ctx->temp_reg;
4660 alu.src[1].chan = 2;
4661
4662 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
4663 alu.src[2].chan = 0;
4664 alu.src[2].value = *(uint32_t *)&one_point_five;
4665
4666 alu.dst.sel = ctx->temp_reg;
4667 alu.dst.chan = 1;
4668 alu.dst.write = 1;
4669
4670 alu.last = 1;
4671 r = r600_bytecode_add_alu(ctx->bc, &alu);
4672 if (r)
4673 return r;
4674 /* write initial compare value into Z component
4675 - W src 0 for shadow cube
4676 - X src 1 for shadow cube array */
4677 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
4678 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4679 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4680 alu.op = ALU_OP1_MOV;
4681 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
4682 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
4683 else
4684 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4685 alu.dst.sel = ctx->temp_reg;
4686 alu.dst.chan = 2;
4687 alu.dst.write = 1;
4688 alu.last = 1;
4689 r = r600_bytecode_add_alu(ctx->bc, &alu);
4690 if (r)
4691 return r;
4692 }
4693
4694 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
4695 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
4696 if (ctx->bc->chip_class >= EVERGREEN) {
4697 int mytmp = r600_get_temp(ctx);
4698 static const float eight = 8.0f;
4699 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4700 alu.op = ALU_OP1_MOV;
4701 alu.src[0].sel = ctx->temp_reg;
4702 alu.src[0].chan = 3;
4703 alu.dst.sel = mytmp;
4704 alu.dst.chan = 0;
4705 alu.dst.write = 1;
4706 alu.last = 1;
4707 r = r600_bytecode_add_alu(ctx->bc, &alu);
4708 if (r)
4709 return r;
4710
4711 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */
4712 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4713 alu.op = ALU_OP3_MULADD;
4714 alu.is_op3 = 1;
4715 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4716 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4717 alu.src[1].chan = 0;
4718 alu.src[1].value = *(uint32_t *)&eight;
4719 alu.src[2].sel = mytmp;
4720 alu.src[2].chan = 0;
4721 alu.dst.sel = ctx->temp_reg;
4722 alu.dst.chan = 3;
4723 alu.dst.write = 1;
4724 alu.last = 1;
4725 r = r600_bytecode_add_alu(ctx->bc, &alu);
4726 if (r)
4727 return r;
4728 } else if (ctx->bc->chip_class < EVERGREEN) {
4729 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4730 tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
4731 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4732 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4733 tex.src_gpr = r600_get_temp(ctx);
4734 tex.src_sel_x = 0;
4735 tex.src_sel_y = 0;
4736 tex.src_sel_z = 0;
4737 tex.src_sel_w = 0;
4738 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
4739 tex.coord_type_x = 1;
4740 tex.coord_type_y = 1;
4741 tex.coord_type_z = 1;
4742 tex.coord_type_w = 1;
4743 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4744 alu.op = ALU_OP1_MOV;
4745 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4746 alu.dst.sel = tex.src_gpr;
4747 alu.dst.chan = 0;
4748 alu.last = 1;
4749 alu.dst.write = 1;
4750 r = r600_bytecode_add_alu(ctx->bc, &alu);
4751 if (r)
4752 return r;
4753
4754 r = r600_bytecode_add_tex(ctx->bc, &tex);
4755 if (r)
4756 return r;
4757 }
4758
4759 }
4760
4761 /* for cube forms of lod and bias we need to route things */
4762 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
4763 inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
4764 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
4765 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
4766 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4767 alu.op = ALU_OP1_MOV;
4768 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
4769 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
4770 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
4771 else
4772 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
4773 alu.dst.sel = ctx->temp_reg;
4774 alu.dst.chan = 2;
4775 alu.last = 1;
4776 alu.dst.write = 1;
4777 r = r600_bytecode_add_alu(ctx->bc, &alu);
4778 if (r)
4779 return r;
4780 }
4781
4782 src_loaded = TRUE;
4783 src_gpr = ctx->temp_reg;
4784 }
4785
4786 if (src_requires_loading && !src_loaded) {
4787 for (i = 0; i < 4; i++) {
4788 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4789 alu.op = ALU_OP1_MOV;
4790 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4791 alu.dst.sel = ctx->temp_reg;
4792 alu.dst.chan = i;
4793 if (i == 3)
4794 alu.last = 1;
4795 alu.dst.write = 1;
4796 r = r600_bytecode_add_alu(ctx->bc, &alu);
4797 if (r)
4798 return r;
4799 }
4800 src_loaded = TRUE;
4801 src_gpr = ctx->temp_reg;
4802 }
4803
4804 /* get offset values */
4805 if (inst->Texture.NumOffsets) {
4806 assert(inst->Texture.NumOffsets == 1);
4807
4808 /* The texture offset feature doesn't work with the TXF instruction
4809 * and must be emulated by adding the offset to the texture coordinates. */
4810 if (txf_add_offsets) {
4811 const struct tgsi_texture_offset *off = inst->TexOffsets;
4812
4813 switch (inst->Texture.Texture) {
4814 case TGSI_TEXTURE_3D:
4815 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4816 alu.op = ALU_OP2_ADD_INT;
4817 alu.src[0].sel = src_gpr;
4818 alu.src[0].chan = 2;
4819 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4820 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
4821 alu.dst.sel = src_gpr;
4822 alu.dst.chan = 2;
4823 alu.dst.write = 1;
4824 alu.last = 1;
4825 r = r600_bytecode_add_alu(ctx->bc, &alu);
4826 if (r)
4827 return r;
4828 /* fall through */
4829
4830 case TGSI_TEXTURE_2D:
4831 case TGSI_TEXTURE_SHADOW2D:
4832 case TGSI_TEXTURE_RECT:
4833 case TGSI_TEXTURE_SHADOWRECT:
4834 case TGSI_TEXTURE_2D_ARRAY:
4835 case TGSI_TEXTURE_SHADOW2D_ARRAY:
4836 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4837 alu.op = ALU_OP2_ADD_INT;
4838 alu.src[0].sel = src_gpr;
4839 alu.src[0].chan = 1;
4840 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4841 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
4842 alu.dst.sel = src_gpr;
4843 alu.dst.chan = 1;
4844 alu.dst.write = 1;
4845 alu.last = 1;
4846 r = r600_bytecode_add_alu(ctx->bc, &alu);
4847 if (r)
4848 return r;
4849 /* fall through */
4850
4851 case TGSI_TEXTURE_1D:
4852 case TGSI_TEXTURE_SHADOW1D:
4853 case TGSI_TEXTURE_1D_ARRAY:
4854 case TGSI_TEXTURE_SHADOW1D_ARRAY:
4855 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4856 alu.op = ALU_OP2_ADD_INT;
4857 alu.src[0].sel = src_gpr;
4858 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4859 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
4860 alu.dst.sel = src_gpr;
4861 alu.dst.write = 1;
4862 alu.last = 1;
4863 r = r600_bytecode_add_alu(ctx->bc, &alu);
4864 if (r)
4865 return r;
4866 break;
4867 /* texture offsets do not apply to other texture targets */
4868 }
4869 } else {
4870 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
4871 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
4872 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
4873 }
4874 }
4875
4876 /* Obtain the sample index for reading a compressed MSAA color texture.
4877 * To read the FMASK, we use the ldfptr instruction, which tells us
4878 * where the samples are stored.
4879 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
4880 * which is the identity mapping. Each nibble says which physical sample
4881 * should be fetched to get that sample.
4882 *
4883 * Assume src.z contains the sample index. It should be modified like this:
4884 * src.z = (ldfptr() >> (src.z * 4)) & 0xF;
4885 * Then fetch the texel with src.
4886 */
4887 if (read_compressed_msaa) {
4888 unsigned sample_chan = 3;
4889 unsigned temp = r600_get_temp(ctx);
4890 assert(src_loaded);
4891
4892 /* temp.w = ldfptr() */
4893 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4894 tex.op = FETCH_OP_LD;
4895 tex.inst_mod = 1; /* to indicate this is ldfptr */
4896 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
4897 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
4898 tex.src_gpr = src_gpr;
4899 tex.dst_gpr = temp;
4900 tex.dst_sel_x = 7; /* mask out these components */
4901 tex.dst_sel_y = 7;
4902 tex.dst_sel_z = 7;
4903 tex.dst_sel_w = 0; /* store X */
4904 tex.src_sel_x = 0;
4905 tex.src_sel_y = 1;
4906 tex.src_sel_z = 2;
4907 tex.src_sel_w = 3;
4908 tex.offset_x = offset_x;
4909 tex.offset_y = offset_y;
4910 tex.offset_z = offset_z;
4911 r = r600_bytecode_add_tex(ctx->bc, &tex);
4912 if (r)
4913 return r;
4914
4915 /* temp.x = sample_index*4 */
4916 if (ctx->bc->chip_class == CAYMAN) {
4917 for (i = 0 ; i < 4; i++) {
4918 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4919 alu.op = ALU_OP2_MULLO_INT;
4920 alu.src[0].sel = src_gpr;
4921 alu.src[0].chan = sample_chan;
4922 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4923 alu.src[1].value = 4;
4924 alu.dst.sel = temp;
4925 alu.dst.chan = i;
4926 alu.dst.write = i == 0;
4927 if (i == 3)
4928 alu.last = 1;
4929 r = r600_bytecode_add_alu(ctx->bc, &alu);
4930 if (r)
4931 return r;
4932 }
4933 } else {
4934 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4935 alu.op = ALU_OP2_MULLO_INT;
4936 alu.src[0].sel = src_gpr;
4937 alu.src[0].chan = sample_chan;
4938 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4939 alu.src[1].value = 4;
4940 alu.dst.sel = temp;
4941 alu.dst.chan = 0;
4942 alu.dst.write = 1;
4943 alu.last = 1;
4944 r = r600_bytecode_add_alu(ctx->bc, &alu);
4945 if (r)
4946 return r;
4947 }
4948
4949 /* sample_index = temp.w >> temp.x */
4950 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4951 alu.op = ALU_OP2_LSHR_INT;
4952 alu.src[0].sel = temp;
4953 alu.src[0].chan = 3;
4954 alu.src[1].sel = temp;
4955 alu.src[1].chan = 0;
4956 alu.dst.sel = src_gpr;
4957 alu.dst.chan = sample_chan;
4958 alu.dst.write = 1;
4959 alu.last = 1;
4960 r = r600_bytecode_add_alu(ctx->bc, &alu);
4961 if (r)
4962 return r;
4963
4964 /* sample_index & 0xF */
4965 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4966 alu.op = ALU_OP2_AND_INT;
4967 alu.src[0].sel = src_gpr;
4968 alu.src[0].chan = sample_chan;
4969 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4970 alu.src[1].value = 0xF;
4971 alu.dst.sel = src_gpr;
4972 alu.dst.chan = sample_chan;
4973 alu.dst.write = 1;
4974 alu.last = 1;
4975 r = r600_bytecode_add_alu(ctx->bc, &alu);
4976 if (r)
4977 return r;
4978 #if 0
4979 /* visualize the FMASK */
4980 for (i = 0; i < 4; i++) {
4981 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4982 alu.op = ALU_OP1_INT_TO_FLT;
4983 alu.src[0].sel = src_gpr;
4984 alu.src[0].chan = sample_chan;
4985 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
4986 alu.dst.chan = i;
4987 alu.dst.write = 1;
4988 alu.last = 1;
4989 r = r600_bytecode_add_alu(ctx->bc, &alu);
4990 if (r)
4991 return r;
4992 }
4993 return 0;
4994 #endif
4995 }
4996
4997 /* does this shader want a num layers from TXQ for a cube array? */
4998 if (has_txq_cube_array_z) {
4999 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5000
5001 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5002 alu.op = ALU_OP1_MOV;
5003
5004 alu.src[0].sel = 512 + (id / 4);
5005 alu.src[0].kc_bank = R600_TXQ_CONST_BUFFER;
5006 alu.src[0].chan = id % 4;
5007 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
5008 alu.last = 1;
5009 r = r600_bytecode_add_alu(ctx->bc, &alu);
5010 if (r)
5011 return r;
5012 /* disable writemask from texture instruction */
5013 inst->Dst[0].Register.WriteMask &= ~4;
5014 }
5015
5016 opcode = ctx->inst_info->op;
5017 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
5018 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
5019 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
5020 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5021 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
5022 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
5023 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5024 switch (opcode) {
5025 case FETCH_OP_SAMPLE:
5026 opcode = FETCH_OP_SAMPLE_C;
5027 break;
5028 case FETCH_OP_SAMPLE_L:
5029 opcode = FETCH_OP_SAMPLE_C_L;
5030 break;
5031 case FETCH_OP_SAMPLE_LB:
5032 opcode = FETCH_OP_SAMPLE_C_LB;
5033 break;
5034 case FETCH_OP_SAMPLE_G:
5035 opcode = FETCH_OP_SAMPLE_C_G;
5036 break;
5037 }
5038 }
5039
5040 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5041 tex.op = opcode;
5042
5043 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5044 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5045 tex.src_gpr = src_gpr;
5046 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
5047 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
5048 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
5049 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
5050 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
5051
5052 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) {
5053 tex.src_sel_x = 4;
5054 tex.src_sel_y = 4;
5055 tex.src_sel_z = 4;
5056 tex.src_sel_w = 4;
5057 } else if (src_loaded) {
5058 tex.src_sel_x = 0;
5059 tex.src_sel_y = 1;
5060 tex.src_sel_z = 2;
5061 tex.src_sel_w = 3;
5062 } else {
5063 tex.src_sel_x = ctx->src[0].swizzle[0];
5064 tex.src_sel_y = ctx->src[0].swizzle[1];
5065 tex.src_sel_z = ctx->src[0].swizzle[2];
5066 tex.src_sel_w = ctx->src[0].swizzle[3];
5067 tex.src_rel = ctx->src[0].rel;
5068 }
5069
5070 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
5071 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5072 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5073 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5074 tex.src_sel_x = 1;
5075 tex.src_sel_y = 0;
5076 tex.src_sel_z = 3;
5077 tex.src_sel_w = 2; /* route Z compare or Lod value into W */
5078 }
5079
5080 if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
5081 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
5082 tex.coord_type_x = 1;
5083 tex.coord_type_y = 1;
5084 }
5085 tex.coord_type_z = 1;
5086 tex.coord_type_w = 1;
5087
5088 tex.offset_x = offset_x;
5089 tex.offset_y = offset_y;
5090 tex.offset_z = offset_z;
5091
5092 /* Put the depth for comparison in W.
5093 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
5094 * Some instructions expect the depth in Z. */
5095 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
5096 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
5097 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
5098 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
5099 opcode != FETCH_OP_SAMPLE_C_L &&
5100 opcode != FETCH_OP_SAMPLE_C_LB) {
5101 tex.src_sel_w = tex.src_sel_z;
5102 }
5103
5104 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
5105 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
5106 if (opcode == FETCH_OP_SAMPLE_C_L ||
5107 opcode == FETCH_OP_SAMPLE_C_LB) {
5108 /* the array index is read from Y */
5109 tex.coord_type_y = 0;
5110 } else {
5111 /* the array index is read from Z */
5112 tex.coord_type_z = 0;
5113 tex.src_sel_z = tex.src_sel_y;
5114 }
5115 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
5116 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
5117 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5118 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
5119 (ctx->bc->chip_class >= EVERGREEN)))
5120 /* the array index is read from Z */
5121 tex.coord_type_z = 0;
5122
5123 /* mask unused source components */
5124 if (opcode == FETCH_OP_SAMPLE) {
5125 switch (inst->Texture.Texture) {
5126 case TGSI_TEXTURE_2D:
5127 case TGSI_TEXTURE_RECT:
5128 tex.src_sel_z = 7;
5129 tex.src_sel_w = 7;
5130 break;
5131 case TGSI_TEXTURE_1D_ARRAY:
5132 tex.src_sel_y = 7;
5133 tex.src_sel_w = 7;
5134 break;
5135 case TGSI_TEXTURE_1D:
5136 tex.src_sel_y = 7;
5137 tex.src_sel_z = 7;
5138 tex.src_sel_w = 7;
5139 break;
5140 }
5141 }
5142
5143 r = r600_bytecode_add_tex(ctx->bc, &tex);
5144 if (r)
5145 return r;
5146
5147 /* add shadow ambient support - gallium doesn't do it yet */
5148 return 0;
5149 }
5150
5151 static int tgsi_lrp(struct r600_shader_ctx *ctx)
5152 {
5153 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5154 struct r600_bytecode_alu alu;
5155 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5156 unsigned i;
5157 int r;
5158
5159 /* optimize if it's just an equal balance */
5160 if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
5161 for (i = 0; i < lasti + 1; i++) {
5162 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5163 continue;
5164
5165 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5166 alu.op = ALU_OP2_ADD;
5167 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5168 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5169 alu.omod = 3;
5170 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5171 alu.dst.chan = i;
5172 if (i == lasti) {
5173 alu.last = 1;
5174 }
5175 r = r600_bytecode_add_alu(ctx->bc, &alu);
5176 if (r)
5177 return r;
5178 }
5179 return 0;
5180 }
5181
5182 /* 1 - src0 */
5183 for (i = 0; i < lasti + 1; i++) {
5184 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5185 continue;
5186
5187 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5188 alu.op = ALU_OP2_ADD;
5189 alu.src[0].sel = V_SQ_ALU_SRC_1;
5190 alu.src[0].chan = 0;
5191 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5192 r600_bytecode_src_toggle_neg(&alu.src[1]);
5193 alu.dst.sel = ctx->temp_reg;
5194 alu.dst.chan = i;
5195 if (i == lasti) {
5196 alu.last = 1;
5197 }
5198 alu.dst.write = 1;
5199 r = r600_bytecode_add_alu(ctx->bc, &alu);
5200 if (r)
5201 return r;
5202 }
5203
5204 /* (1 - src0) * src2 */
5205 for (i = 0; i < lasti + 1; i++) {
5206 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5207 continue;
5208
5209 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5210 alu.op = ALU_OP2_MUL;
5211 alu.src[0].sel = ctx->temp_reg;
5212 alu.src[0].chan = i;
5213 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5214 alu.dst.sel = ctx->temp_reg;
5215 alu.dst.chan = i;
5216 if (i == lasti) {
5217 alu.last = 1;
5218 }
5219 alu.dst.write = 1;
5220 r = r600_bytecode_add_alu(ctx->bc, &alu);
5221 if (r)
5222 return r;
5223 }
5224
5225 /* src0 * src1 + (1 - src0) * src2 */
5226 for (i = 0; i < lasti + 1; i++) {
5227 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5228 continue;
5229
5230 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5231 alu.op = ALU_OP3_MULADD;
5232 alu.is_op3 = 1;
5233 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5234 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5235 alu.src[2].sel = ctx->temp_reg;
5236 alu.src[2].chan = i;
5237
5238 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5239 alu.dst.chan = i;
5240 if (i == lasti) {
5241 alu.last = 1;
5242 }
5243 r = r600_bytecode_add_alu(ctx->bc, &alu);
5244 if (r)
5245 return r;
5246 }
5247 return 0;
5248 }
5249
5250 static int tgsi_cmp(struct r600_shader_ctx *ctx)
5251 {
5252 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5253 struct r600_bytecode_alu alu;
5254 int i, r;
5255 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5256
5257 for (i = 0; i < lasti + 1; i++) {
5258 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5259 continue;
5260
5261 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5262 alu.op = ALU_OP3_CNDGE;
5263 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5264 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5265 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
5266 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5267 alu.dst.chan = i;
5268 alu.dst.write = 1;
5269 alu.is_op3 = 1;
5270 if (i == lasti)
5271 alu.last = 1;
5272 r = r600_bytecode_add_alu(ctx->bc, &alu);
5273 if (r)
5274 return r;
5275 }
5276 return 0;
5277 }
5278
5279 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
5280 {
5281 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5282 struct r600_bytecode_alu alu;
5283 int i, r;
5284 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5285
5286 for (i = 0; i < lasti + 1; i++) {
5287 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5288 continue;
5289
5290 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5291 alu.op = ALU_OP3_CNDGE_INT;
5292 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5293 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5294 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
5295 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5296 alu.dst.chan = i;
5297 alu.dst.write = 1;
5298 alu.is_op3 = 1;
5299 if (i == lasti)
5300 alu.last = 1;
5301 r = r600_bytecode_add_alu(ctx->bc, &alu);
5302 if (r)
5303 return r;
5304 }
5305 return 0;
5306 }
5307
5308 static int tgsi_xpd(struct r600_shader_ctx *ctx)
5309 {
5310 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5311 static const unsigned int src0_swizzle[] = {2, 0, 1};
5312 static const unsigned int src1_swizzle[] = {1, 2, 0};
5313 struct r600_bytecode_alu alu;
5314 uint32_t use_temp = 0;
5315 int i, r;
5316
5317 if (inst->Dst[0].Register.WriteMask != 0xf)
5318 use_temp = 1;
5319
5320 for (i = 0; i < 4; i++) {
5321 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5322 alu.op = ALU_OP2_MUL;
5323 if (i < 3) {
5324 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
5325 r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
5326 } else {
5327 alu.src[0].sel = V_SQ_ALU_SRC_0;
5328 alu.src[0].chan = i;
5329 alu.src[1].sel = V_SQ_ALU_SRC_0;
5330 alu.src[1].chan = i;
5331 }
5332
5333 alu.dst.sel = ctx->temp_reg;
5334 alu.dst.chan = i;
5335 alu.dst.write = 1;
5336
5337 if (i == 3)
5338 alu.last = 1;
5339 r = r600_bytecode_add_alu(ctx->bc, &alu);
5340 if (r)
5341 return r;
5342 }
5343
5344 for (i = 0; i < 4; i++) {
5345 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5346 alu.op = ALU_OP3_MULADD;
5347
5348 if (i < 3) {
5349 r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
5350 r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
5351 } else {
5352 alu.src[0].sel = V_SQ_ALU_SRC_0;
5353 alu.src[0].chan = i;
5354 alu.src[1].sel = V_SQ_ALU_SRC_0;
5355 alu.src[1].chan = i;
5356 }
5357
5358 alu.src[2].sel = ctx->temp_reg;
5359 alu.src[2].neg = 1;
5360 alu.src[2].chan = i;
5361
5362 if (use_temp)
5363 alu.dst.sel = ctx->temp_reg;
5364 else
5365 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5366 alu.dst.chan = i;
5367 alu.dst.write = 1;
5368 alu.is_op3 = 1;
5369 if (i == 3)
5370 alu.last = 1;
5371 r = r600_bytecode_add_alu(ctx->bc, &alu);
5372 if (r)
5373 return r;
5374 }
5375 if (use_temp)
5376 return tgsi_helper_copy(ctx, inst);
5377 return 0;
5378 }
5379
5380 static int tgsi_exp(struct r600_shader_ctx *ctx)
5381 {
5382 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5383 struct r600_bytecode_alu alu;
5384 int r;
5385 int i;
5386
5387 /* result.x = 2^floor(src); */
5388 if (inst->Dst[0].Register.WriteMask & 1) {
5389 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5390
5391 alu.op = ALU_OP1_FLOOR;
5392 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5393
5394 alu.dst.sel = ctx->temp_reg;
5395 alu.dst.chan = 0;
5396 alu.dst.write = 1;
5397 alu.last = 1;
5398 r = r600_bytecode_add_alu(ctx->bc, &alu);
5399 if (r)
5400 return r;
5401
5402 if (ctx->bc->chip_class == CAYMAN) {
5403 for (i = 0; i < 3; i++) {
5404 alu.op = ALU_OP1_EXP_IEEE;
5405 alu.src[0].sel = ctx->temp_reg;
5406 alu.src[0].chan = 0;
5407
5408 alu.dst.sel = ctx->temp_reg;
5409 alu.dst.chan = i;
5410 alu.dst.write = i == 0;
5411 alu.last = i == 2;
5412 r = r600_bytecode_add_alu(ctx->bc, &alu);
5413 if (r)
5414 return r;
5415 }
5416 } else {
5417 alu.op = ALU_OP1_EXP_IEEE;
5418 alu.src[0].sel = ctx->temp_reg;
5419 alu.src[0].chan = 0;
5420
5421 alu.dst.sel = ctx->temp_reg;
5422 alu.dst.chan = 0;
5423 alu.dst.write = 1;
5424 alu.last = 1;
5425 r = r600_bytecode_add_alu(ctx->bc, &alu);
5426 if (r)
5427 return r;
5428 }
5429 }
5430
5431 /* result.y = tmp - floor(tmp); */
5432 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
5433 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5434
5435 alu.op = ALU_OP1_FRACT;
5436 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5437
5438 alu.dst.sel = ctx->temp_reg;
5439 #if 0
5440 r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5441 if (r)
5442 return r;
5443 #endif
5444 alu.dst.write = 1;
5445 alu.dst.chan = 1;
5446
5447 alu.last = 1;
5448
5449 r = r600_bytecode_add_alu(ctx->bc, &alu);
5450 if (r)
5451 return r;
5452 }
5453
5454 /* result.z = RoughApprox2ToX(tmp);*/
5455 if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
5456 if (ctx->bc->chip_class == CAYMAN) {
5457 for (i = 0; i < 3; i++) {
5458 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5459 alu.op = ALU_OP1_EXP_IEEE;
5460 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5461
5462 alu.dst.sel = ctx->temp_reg;
5463 alu.dst.chan = i;
5464 if (i == 2) {
5465 alu.dst.write = 1;
5466 alu.last = 1;
5467 }
5468
5469 r = r600_bytecode_add_alu(ctx->bc, &alu);
5470 if (r)
5471 return r;
5472 }
5473 } else {
5474 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5475 alu.op = ALU_OP1_EXP_IEEE;
5476 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5477
5478 alu.dst.sel = ctx->temp_reg;
5479 alu.dst.write = 1;
5480 alu.dst.chan = 2;
5481
5482 alu.last = 1;
5483
5484 r = r600_bytecode_add_alu(ctx->bc, &alu);
5485 if (r)
5486 return r;
5487 }
5488 }
5489
5490 /* result.w = 1.0;*/
5491 if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
5492 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5493
5494 alu.op = ALU_OP1_MOV;
5495 alu.src[0].sel = V_SQ_ALU_SRC_1;
5496 alu.src[0].chan = 0;
5497
5498 alu.dst.sel = ctx->temp_reg;
5499 alu.dst.chan = 3;
5500 alu.dst.write = 1;
5501 alu.last = 1;
5502 r = r600_bytecode_add_alu(ctx->bc, &alu);
5503 if (r)
5504 return r;
5505 }
5506 return tgsi_helper_copy(ctx, inst);
5507 }
5508
5509 static int tgsi_log(struct r600_shader_ctx *ctx)
5510 {
5511 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5512 struct r600_bytecode_alu alu;
5513 int r;
5514 int i;
5515
5516 /* result.x = floor(log2(|src|)); */
5517 if (inst->Dst[0].Register.WriteMask & 1) {
5518 if (ctx->bc->chip_class == CAYMAN) {
5519 for (i = 0; i < 3; i++) {
5520 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5521
5522 alu.op = ALU_OP1_LOG_IEEE;
5523 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5524 r600_bytecode_src_set_abs(&alu.src[0]);
5525
5526 alu.dst.sel = ctx->temp_reg;
5527 alu.dst.chan = i;
5528 if (i == 0)
5529 alu.dst.write = 1;
5530 if (i == 2)
5531 alu.last = 1;
5532 r = r600_bytecode_add_alu(ctx->bc, &alu);
5533 if (r)
5534 return r;
5535 }
5536
5537 } else {
5538 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5539
5540 alu.op = ALU_OP1_LOG_IEEE;
5541 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5542 r600_bytecode_src_set_abs(&alu.src[0]);
5543
5544 alu.dst.sel = ctx->temp_reg;
5545 alu.dst.chan = 0;
5546 alu.dst.write = 1;
5547 alu.last = 1;
5548 r = r600_bytecode_add_alu(ctx->bc, &alu);
5549 if (r)
5550 return r;
5551 }
5552
5553 alu.op = ALU_OP1_FLOOR;
5554 alu.src[0].sel = ctx->temp_reg;
5555 alu.src[0].chan = 0;
5556
5557 alu.dst.sel = ctx->temp_reg;
5558 alu.dst.chan = 0;
5559 alu.dst.write = 1;
5560 alu.last = 1;
5561
5562 r = r600_bytecode_add_alu(ctx->bc, &alu);
5563 if (r)
5564 return r;
5565 }
5566
5567 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
5568 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
5569
5570 if (ctx->bc->chip_class == CAYMAN) {
5571 for (i = 0; i < 3; i++) {
5572 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5573
5574 alu.op = ALU_OP1_LOG_IEEE;
5575 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5576 r600_bytecode_src_set_abs(&alu.src[0]);
5577
5578 alu.dst.sel = ctx->temp_reg;
5579 alu.dst.chan = i;
5580 if (i == 1)
5581 alu.dst.write = 1;
5582 if (i == 2)
5583 alu.last = 1;
5584
5585 r = r600_bytecode_add_alu(ctx->bc, &alu);
5586 if (r)
5587 return r;
5588 }
5589 } else {
5590 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5591
5592 alu.op = ALU_OP1_LOG_IEEE;
5593 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5594 r600_bytecode_src_set_abs(&alu.src[0]);
5595
5596 alu.dst.sel = ctx->temp_reg;
5597 alu.dst.chan = 1;
5598 alu.dst.write = 1;
5599 alu.last = 1;
5600
5601 r = r600_bytecode_add_alu(ctx->bc, &alu);
5602 if (r)
5603 return r;
5604 }
5605
5606 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5607
5608 alu.op = ALU_OP1_FLOOR;
5609 alu.src[0].sel = ctx->temp_reg;
5610 alu.src[0].chan = 1;
5611
5612 alu.dst.sel = ctx->temp_reg;
5613 alu.dst.chan = 1;
5614 alu.dst.write = 1;
5615 alu.last = 1;
5616
5617 r = r600_bytecode_add_alu(ctx->bc, &alu);
5618 if (r)
5619 return r;
5620
5621 if (ctx->bc->chip_class == CAYMAN) {
5622 for (i = 0; i < 3; i++) {
5623 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5624 alu.op = ALU_OP1_EXP_IEEE;
5625 alu.src[0].sel = ctx->temp_reg;
5626 alu.src[0].chan = 1;
5627
5628 alu.dst.sel = ctx->temp_reg;
5629 alu.dst.chan = i;
5630 if (i == 1)
5631 alu.dst.write = 1;
5632 if (i == 2)
5633 alu.last = 1;
5634
5635 r = r600_bytecode_add_alu(ctx->bc, &alu);
5636 if (r)
5637 return r;
5638 }
5639 } else {
5640 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5641 alu.op = ALU_OP1_EXP_IEEE;
5642 alu.src[0].sel = ctx->temp_reg;
5643 alu.src[0].chan = 1;
5644
5645 alu.dst.sel = ctx->temp_reg;
5646 alu.dst.chan = 1;
5647 alu.dst.write = 1;
5648 alu.last = 1;
5649
5650 r = r600_bytecode_add_alu(ctx->bc, &alu);
5651 if (r)
5652 return r;
5653 }
5654
5655 if (ctx->bc->chip_class == CAYMAN) {
5656 for (i = 0; i < 3; i++) {
5657 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5658 alu.op = ALU_OP1_RECIP_IEEE;
5659 alu.src[0].sel = ctx->temp_reg;
5660 alu.src[0].chan = 1;
5661
5662 alu.dst.sel = ctx->temp_reg;
5663 alu.dst.chan = i;
5664 if (i == 1)
5665 alu.dst.write = 1;
5666 if (i == 2)
5667 alu.last = 1;
5668
5669 r = r600_bytecode_add_alu(ctx->bc, &alu);
5670 if (r)
5671 return r;
5672 }
5673 } else {
5674 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5675 alu.op = ALU_OP1_RECIP_IEEE;
5676 alu.src[0].sel = ctx->temp_reg;
5677 alu.src[0].chan = 1;
5678
5679 alu.dst.sel = ctx->temp_reg;
5680 alu.dst.chan = 1;
5681 alu.dst.write = 1;
5682 alu.last = 1;
5683
5684 r = r600_bytecode_add_alu(ctx->bc, &alu);
5685 if (r)
5686 return r;
5687 }
5688
5689 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5690
5691 alu.op = ALU_OP2_MUL;
5692
5693 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5694 r600_bytecode_src_set_abs(&alu.src[0]);
5695
5696 alu.src[1].sel = ctx->temp_reg;
5697 alu.src[1].chan = 1;
5698
5699 alu.dst.sel = ctx->temp_reg;
5700 alu.dst.chan = 1;
5701 alu.dst.write = 1;
5702 alu.last = 1;
5703
5704 r = r600_bytecode_add_alu(ctx->bc, &alu);
5705 if (r)
5706 return r;
5707 }
5708
5709 /* result.z = log2(|src|);*/
5710 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
5711 if (ctx->bc->chip_class == CAYMAN) {
5712 for (i = 0; i < 3; i++) {
5713 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5714
5715 alu.op = ALU_OP1_LOG_IEEE;
5716 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5717 r600_bytecode_src_set_abs(&alu.src[0]);
5718
5719 alu.dst.sel = ctx->temp_reg;
5720 if (i == 2)
5721 alu.dst.write = 1;
5722 alu.dst.chan = i;
5723 if (i == 2)
5724 alu.last = 1;
5725
5726 r = r600_bytecode_add_alu(ctx->bc, &alu);
5727 if (r)
5728 return r;
5729 }
5730 } else {
5731 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5732
5733 alu.op = ALU_OP1_LOG_IEEE;
5734 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5735 r600_bytecode_src_set_abs(&alu.src[0]);
5736
5737 alu.dst.sel = ctx->temp_reg;
5738 alu.dst.write = 1;
5739 alu.dst.chan = 2;
5740 alu.last = 1;
5741
5742 r = r600_bytecode_add_alu(ctx->bc, &alu);
5743 if (r)
5744 return r;
5745 }
5746 }
5747
5748 /* result.w = 1.0; */
5749 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
5750 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5751
5752 alu.op = ALU_OP1_MOV;
5753 alu.src[0].sel = V_SQ_ALU_SRC_1;
5754 alu.src[0].chan = 0;
5755
5756 alu.dst.sel = ctx->temp_reg;
5757 alu.dst.chan = 3;
5758 alu.dst.write = 1;
5759 alu.last = 1;
5760
5761 r = r600_bytecode_add_alu(ctx->bc, &alu);
5762 if (r)
5763 return r;
5764 }
5765
5766 return tgsi_helper_copy(ctx, inst);
5767 }
5768
5769 static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
5770 {
5771 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5772 struct r600_bytecode_alu alu;
5773 int r;
5774
5775 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5776
5777 switch (inst->Instruction.Opcode) {
5778 case TGSI_OPCODE_ARL:
5779 alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
5780 break;
5781 case TGSI_OPCODE_ARR:
5782 alu.op = ALU_OP1_FLT_TO_INT;
5783 break;
5784 case TGSI_OPCODE_UARL:
5785 alu.op = ALU_OP1_MOV;
5786 break;
5787 default:
5788 assert(0);
5789 return -1;
5790 }
5791
5792 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5793 alu.last = 1;
5794 alu.dst.sel = ctx->bc->ar_reg;
5795 alu.dst.write = 1;
5796 r = r600_bytecode_add_alu(ctx->bc, &alu);
5797 if (r)
5798 return r;
5799
5800 ctx->bc->ar_loaded = 0;
5801 return 0;
5802 }
5803 static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
5804 {
5805 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5806 struct r600_bytecode_alu alu;
5807 int r;
5808
5809 switch (inst->Instruction.Opcode) {
5810 case TGSI_OPCODE_ARL:
5811 memset(&alu, 0, sizeof(alu));
5812 alu.op = ALU_OP1_FLOOR;
5813 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5814 alu.dst.sel = ctx->bc->ar_reg;
5815 alu.dst.write = 1;
5816 alu.last = 1;
5817
5818 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5819 return r;
5820
5821 memset(&alu, 0, sizeof(alu));
5822 alu.op = ALU_OP1_FLT_TO_INT;
5823 alu.src[0].sel = ctx->bc->ar_reg;
5824 alu.dst.sel = ctx->bc->ar_reg;
5825 alu.dst.write = 1;
5826 alu.last = 1;
5827
5828 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5829 return r;
5830 break;
5831 case TGSI_OPCODE_ARR:
5832 memset(&alu, 0, sizeof(alu));
5833 alu.op = ALU_OP1_FLT_TO_INT;
5834 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5835 alu.dst.sel = ctx->bc->ar_reg;
5836 alu.dst.write = 1;
5837 alu.last = 1;
5838
5839 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5840 return r;
5841 break;
5842 case TGSI_OPCODE_UARL:
5843 memset(&alu, 0, sizeof(alu));
5844 alu.op = ALU_OP1_MOV;
5845 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5846 alu.dst.sel = ctx->bc->ar_reg;
5847 alu.dst.write = 1;
5848 alu.last = 1;
5849
5850 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
5851 return r;
5852 break;
5853 default:
5854 assert(0);
5855 return -1;
5856 }
5857
5858 ctx->bc->ar_loaded = 0;
5859 return 0;
5860 }
5861
5862 static int tgsi_opdst(struct r600_shader_ctx *ctx)
5863 {
5864 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5865 struct r600_bytecode_alu alu;
5866 int i, r = 0;
5867
5868 for (i = 0; i < 4; i++) {
5869 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5870
5871 alu.op = ALU_OP2_MUL;
5872 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5873
5874 if (i == 0 || i == 3) {
5875 alu.src[0].sel = V_SQ_ALU_SRC_1;
5876 } else {
5877 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5878 }
5879
5880 if (i == 0 || i == 2) {
5881 alu.src[1].sel = V_SQ_ALU_SRC_1;
5882 } else {
5883 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5884 }
5885 if (i == 3)
5886 alu.last = 1;
5887 r = r600_bytecode_add_alu(ctx->bc, &alu);
5888 if (r)
5889 return r;
5890 }
5891 return 0;
5892 }
5893
5894 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
5895 {
5896 struct r600_bytecode_alu alu;
5897 int r;
5898
5899 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5900 alu.op = opcode;
5901 alu.execute_mask = 1;
5902 alu.update_pred = 1;
5903
5904 alu.dst.sel = ctx->temp_reg;
5905 alu.dst.write = 1;
5906 alu.dst.chan = 0;
5907
5908 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5909 alu.src[1].sel = V_SQ_ALU_SRC_0;
5910 alu.src[1].chan = 0;
5911
5912 alu.last = 1;
5913
5914 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
5915 if (r)
5916 return r;
5917 return 0;
5918 }
5919
/* Emit `pops` branch-stack pops.  When the last CF instruction is an
 * ALU clause the pops are folded into it by switching it to an
 * ALU_POP_AFTER / ALU_POP2_AFTER variant; otherwise an explicit POP CF
 * instruction is emitted.  (The parameter intentionally shadows the
 * function name; the function is not recursive.) */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop encodes how many pops the last CF can still absorb:
		 * 0 = plain ALU (room for 2), 1 = already ALU_POP_AFTER
		 * (room for 1), 3 = cannot fold at all. */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			/* the clause now carries pop semantics; further ALU must
			 * start a new CF instruction */
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		/* resume at the CF instruction following the POP */
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
5952
/* Recompute the worst-case hardware branch-stack depth (in entries)
 * implied by the current loop/push counters, plus the chip-specific
 * reservations triggered by `reason`, and record the maximum in
 * stack->max_entries (later used to program STACK_SIZE). */
static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
		unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements, entries;

	unsigned entry_size = stack->entry_size;

	/* loop and WQM frames consume a full entry; VPM pushes consume a
	 * single element each */
	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 * stack usage.
		 * (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 * PUSH instruction executed.
		 *
		 * NOTE: it seems we also need to reserve additional element in some
		 * other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 * then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
}
6016
6017 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
6018 {
6019 switch(reason) {
6020 case FC_PUSH_VPM:
6021 --ctx->bc->stack.push;
6022 assert(ctx->bc->stack.push >= 0);
6023 break;
6024 case FC_PUSH_WQM:
6025 --ctx->bc->stack.push_wqm;
6026 assert(ctx->bc->stack.push_wqm >= 0);
6027 break;
6028 case FC_LOOP:
6029 --ctx->bc->stack.loop;
6030 assert(ctx->bc->stack.loop >= 0);
6031 break;
6032 default:
6033 assert(0);
6034 break;
6035 }
6036 }
6037
/* Account for a new branch-stack entry of the given kind, then update
 * the recorded worst-case stack depth. */
static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
{
	switch (reason) {
	case FC_PUSH_VPM:
		++ctx->bc->stack.push;
		break;
	case FC_PUSH_WQM:
		++ctx->bc->stack.push_wqm;
		/* fallthrough -- a WQM push is also counted as a loop-style
		 * entry.  NOTE(review): callstack_pop() only decrements
		 * push_wqm for FC_PUSH_WQM, so after a pop stack.loop stays
		 * inflated; confirm this is intentional (it can only
		 * over-reserve stack space, never under-reserve). */
	case FC_LOOP:
		++ctx->bc->stack.loop;
		break;
	default:
		assert(0);
	}

	callstack_update_max_depth(ctx, reason);
}
6055
/* Record the current CF instruction as a "mid" of the flow-control
 * frame at depth fc_sp (e.g. the ELSE of an IF, or a BREAK/CONTINUE of
 * a loop) so its branch target can be fixed up when the frame closes.
 * NOTE(review): the realloc result is not checked -- on OOM sp->mid is
 * overwritten with NULL and the store below crashes. */
static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
{
	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];

	sp->mid = realloc((void *)sp->mid,
				sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
	sp->mid[sp->num_mid] = ctx->bc->cf_last;
	sp->num_mid++;
}
6065
6066 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
6067 {
6068 ctx->bc->fc_sp++;
6069 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
6070 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
6071 }
6072
6073 static void fc_poplevel(struct r600_shader_ctx *ctx)
6074 {
6075 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
6076 free(sp->mid);
6077 sp->mid = NULL;
6078 sp->num_mid = 0;
6079 sp->start = NULL;
6080 sp->type = 0;
6081 ctx->bc->fc_sp--;
6082 }
6083
6084 #if 0
6085 static int emit_return(struct r600_shader_ctx *ctx)
6086 {
6087 r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
6088 return 0;
6089 }
6090
6091 static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
6092 {
6093
6094 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
6095 ctx->bc->cf_last->pop_count = pops;
6096 /* XXX work out offset */
6097 return 0;
6098 }
6099
6100 static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
6101 {
6102 return 0;
6103 }
6104
6105 static void emit_testflag(struct r600_shader_ctx *ctx)
6106 {
6107
6108 }
6109
6110 static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
6111 {
6112 emit_testflag(ctx);
6113 emit_jump_to_offset(ctx, 1, 4);
6114 emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
6115 pops(ctx, ifidx + 1);
6116 emit_return(ctx);
6117 }
6118
6119 static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
6120 {
6121 emit_testflag(ctx);
6122
6123 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
6124 ctx->bc->cf_last->pop_count = 1;
6125
6126 fc_set_mid(ctx, fc_sp);
6127
6128 pops(ctx, 1);
6129 }
6130 #endif
6131
/* Open an IF construct: emit the predicate-setting ALU (with push
 * semantics) followed by a conditional JUMP, and track the construct on
 * the flow-control and call stacks.  The JUMP target is patched later
 * by tgsi_else()/tgsi_endif(). */
static int emit_if(struct r600_shader_ctx *ctx, int opcode)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		/* the explicit PUSH resumes at the following CF instruction */
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	fc_pushlevel(ctx, FC_IF);

	callstack_push(ctx, FC_PUSH_VPM);
	return 0;
}
6155
/* TGSI IF: float condition, taken when src != 0.0. */
static int tgsi_if(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE);
}
6160
/* TGSI UIF: integer condition, taken when src != 0. */
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
}
6165
6166 static int tgsi_else(struct r600_shader_ctx *ctx)
6167 {
6168 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
6169 ctx->bc->cf_last->pop_count = 1;
6170
6171 fc_set_mid(ctx, ctx->bc->fc_sp);
6172 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
6173 return 0;
6174 }
6175
6176 static int tgsi_endif(struct r600_shader_ctx *ctx)
6177 {
6178 pops(ctx, 1);
6179 if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
6180 R600_ERR("if/endif unbalanced in shader\n");
6181 return -1;
6182 }
6183
6184 if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
6185 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
6186 ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
6187 } else {
6188 ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
6189 }
6190 fc_poplevel(ctx);
6191
6192 callstack_pop(ctx, FC_PUSH_VPM);
6193 return 0;
6194 }
6195
/* TGSI BGNLOOP: emit LOOP_START_DX10 and push matching flow-control
 * and call-stack state; addresses are fixed up in tgsi_endloop(). */
static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
{
	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
	 * limited to 4096 iterations, like the other LOOP_* instructions. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);

	fc_pushlevel(ctx, FC_LOOP);

	/* check stack depth */
	callstack_push(ctx, FC_LOOP);
	return 0;
}
6208
6209 static int tgsi_endloop(struct r600_shader_ctx *ctx)
6210 {
6211 int i;
6212
6213 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
6214
6215 if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
6216 R600_ERR("loop/endloop in shader code are not paired.\n");
6217 return -EINVAL;
6218 }
6219
6220 /* fixup loop pointers - from r600isa
6221 LOOP END points to CF after LOOP START,
6222 LOOP START point to CF after LOOP END
6223 BRK/CONT point to LOOP END CF
6224 */
6225 ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
6226
6227 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
6228
6229 for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
6230 ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
6231 }
6232 /* XXX add LOOPRET support */
6233 fc_poplevel(ctx);
6234 callstack_pop(ctx, FC_LOOP);
6235 return 0;
6236 }
6237
6238 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
6239 {
6240 unsigned int fscp;
6241
6242 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
6243 {
6244 if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
6245 break;
6246 }
6247
6248 if (fscp == 0) {
6249 R600_ERR("Break not inside loop/endloop pair\n");
6250 return -EINVAL;
6251 }
6252
6253 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
6254
6255 fc_set_mid(ctx, fscp);
6256
6257 return 0;
6258 }
6259
/* Geometry-shader EMIT/ENDPRIM: flush the pending ring writes before an
 * EMIT_VERTEX, then emit the corresponding CF instruction. */
static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
{
	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
		emit_gs_ring_writes(ctx, TRUE);

	return r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
}
6267
/* TGSI_OPCODE_UMAD: dst = src0 * src1 + src2 (unsigned integer).
 * Emitted as MULLO_UINT into ctx->temp_reg followed by ADD_INT into
 * the real destination, per enabled write-mask channel. */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		if (ctx->bc->chip_class == CAYMAN) {
			/* MULLO_UINT is a t-slot-only op: on Cayman it is
			 * replicated over the whole slot group, but only lane i's
			 * result is kept (alu.dst.write below). */
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP2_MULLO_UINT;
				/* NOTE(review): NumSrcRegs is 3 for UMAD, so this also
				 * fills src[2] of a 2-src op -- presumably ignored by
				 * the encoder, confirm. */
				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
				}
				/* fill dst from the TGSI operand, then redirect the
				 * register select to the temp */
				tgsi_dst(ctx, &inst->Dst[0], j, &alu.dst);
				alu.dst.sel = ctx->temp_reg;
				alu.dst.write = (j == i);
				if (j == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.dst.chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;

			alu.op = ALU_OP2_MULLO_UINT;
			for (j = 0; j < 2; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}

			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}


	/* (src0 * src1) + src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6339
6340 static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
6341 {TGSI_OPCODE_ARL, 0, ALU_OP0_NOP, tgsi_r600_arl},
6342 {TGSI_OPCODE_MOV, 0, ALU_OP1_MOV, tgsi_op2},
6343 {TGSI_OPCODE_LIT, 0, ALU_OP0_NOP, tgsi_lit},
6344
6345 /* XXX:
6346 * For state trackers other than OpenGL, we'll want to use
6347 * _RECIP_IEEE instead.
6348 */
6349 {TGSI_OPCODE_RCP, 0, ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
6350
6351 {TGSI_OPCODE_RSQ, 0, ALU_OP0_NOP, tgsi_rsq},
6352 {TGSI_OPCODE_EXP, 0, ALU_OP0_NOP, tgsi_exp},
6353 {TGSI_OPCODE_LOG, 0, ALU_OP0_NOP, tgsi_log},
6354 {TGSI_OPCODE_MUL, 0, ALU_OP2_MUL, tgsi_op2},
6355 {TGSI_OPCODE_ADD, 0, ALU_OP2_ADD, tgsi_op2},
6356 {TGSI_OPCODE_DP3, 0, ALU_OP2_DOT4, tgsi_dp},
6357 {TGSI_OPCODE_DP4, 0, ALU_OP2_DOT4, tgsi_dp},
6358 {TGSI_OPCODE_DST, 0, ALU_OP0_NOP, tgsi_opdst},
6359 {TGSI_OPCODE_MIN, 0, ALU_OP2_MIN, tgsi_op2},
6360 {TGSI_OPCODE_MAX, 0, ALU_OP2_MAX, tgsi_op2},
6361 {TGSI_OPCODE_SLT, 0, ALU_OP2_SETGT, tgsi_op2_swap},
6362 {TGSI_OPCODE_SGE, 0, ALU_OP2_SETGE, tgsi_op2},
6363 {TGSI_OPCODE_MAD, 1, ALU_OP3_MULADD, tgsi_op3},
6364 {TGSI_OPCODE_SUB, 0, ALU_OP2_ADD, tgsi_op2},
6365 {TGSI_OPCODE_LRP, 0, ALU_OP0_NOP, tgsi_lrp},
6366 {TGSI_OPCODE_CND, 0, ALU_OP0_NOP, tgsi_unsupported},
6367 /* gap */
6368 {20, 0, ALU_OP0_NOP, tgsi_unsupported},
6369 {TGSI_OPCODE_DP2A, 0, ALU_OP0_NOP, tgsi_unsupported},
6370 /* gap */
6371 {22, 0, ALU_OP0_NOP, tgsi_unsupported},
6372 {23, 0, ALU_OP0_NOP, tgsi_unsupported},
6373 {TGSI_OPCODE_FRC, 0, ALU_OP1_FRACT, tgsi_op2},
6374 {TGSI_OPCODE_CLAMP, 0, ALU_OP0_NOP, tgsi_unsupported},
6375 {TGSI_OPCODE_FLR, 0, ALU_OP1_FLOOR, tgsi_op2},
6376 {TGSI_OPCODE_ROUND, 0, ALU_OP1_RNDNE, tgsi_op2},
6377 {TGSI_OPCODE_EX2, 0, ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
6378 {TGSI_OPCODE_LG2, 0, ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
6379 {TGSI_OPCODE_POW, 0, ALU_OP0_NOP, tgsi_pow},
6380 {TGSI_OPCODE_XPD, 0, ALU_OP0_NOP, tgsi_xpd},
6381 /* gap */
6382 {32, 0, ALU_OP0_NOP, tgsi_unsupported},
6383 {TGSI_OPCODE_ABS, 0, ALU_OP1_MOV, tgsi_op2},
6384 {TGSI_OPCODE_RCC, 0, ALU_OP0_NOP, tgsi_unsupported},
6385 {TGSI_OPCODE_DPH, 0, ALU_OP2_DOT4, tgsi_dp},
6386 {TGSI_OPCODE_COS, 0, ALU_OP1_COS, tgsi_trig},
6387 {TGSI_OPCODE_DDX, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
6388 {TGSI_OPCODE_DDY, 0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
6389 {TGSI_OPCODE_KILL, 0, ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
6390 {TGSI_OPCODE_PK2H, 0, ALU_OP0_NOP, tgsi_unsupported},
6391 {TGSI_OPCODE_PK2US, 0, ALU_OP0_NOP, tgsi_unsupported},
6392 {TGSI_OPCODE_PK4B, 0, ALU_OP0_NOP, tgsi_unsupported},
6393 {TGSI_OPCODE_PK4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
6394 {TGSI_OPCODE_RFL, 0, ALU_OP0_NOP, tgsi_unsupported},
6395 {TGSI_OPCODE_SEQ, 0, ALU_OP2_SETE, tgsi_op2},
6396 {TGSI_OPCODE_SFL, 0, ALU_OP0_NOP, tgsi_unsupported},
6397 {TGSI_OPCODE_SGT, 0, ALU_OP2_SETGT, tgsi_op2},
6398 {TGSI_OPCODE_SIN, 0, ALU_OP1_SIN, tgsi_trig},
6399 {TGSI_OPCODE_SLE, 0, ALU_OP2_SETGE, tgsi_op2_swap},
6400 {TGSI_OPCODE_SNE, 0, ALU_OP2_SETNE, tgsi_op2},
6401 {TGSI_OPCODE_STR, 0, ALU_OP0_NOP, tgsi_unsupported},
6402 {TGSI_OPCODE_TEX, 0, FETCH_OP_SAMPLE, tgsi_tex},
6403 {TGSI_OPCODE_TXD, 0, FETCH_OP_SAMPLE_G, tgsi_tex},
6404 {TGSI_OPCODE_TXP, 0, FETCH_OP_SAMPLE, tgsi_tex},
6405 {TGSI_OPCODE_UP2H, 0, ALU_OP0_NOP, tgsi_unsupported},
6406 {TGSI_OPCODE_UP2US, 0, ALU_OP0_NOP, tgsi_unsupported},
6407 {TGSI_OPCODE_UP4B, 0, ALU_OP0_NOP, tgsi_unsupported},
6408 {TGSI_OPCODE_UP4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
6409 {TGSI_OPCODE_X2D, 0, ALU_OP0_NOP, tgsi_unsupported},
6410 {TGSI_OPCODE_ARA, 0, ALU_OP0_NOP, tgsi_unsupported},
6411 {TGSI_OPCODE_ARR, 0, ALU_OP0_NOP, tgsi_r600_arl},
6412 {TGSI_OPCODE_BRA, 0, ALU_OP0_NOP, tgsi_unsupported},
6413 {TGSI_OPCODE_CAL, 0, ALU_OP0_NOP, tgsi_unsupported},
6414 {TGSI_OPCODE_RET, 0, ALU_OP0_NOP, tgsi_unsupported},
6415 {TGSI_OPCODE_SSG, 0, ALU_OP0_NOP, tgsi_ssg},
6416 {TGSI_OPCODE_CMP, 0, ALU_OP0_NOP, tgsi_cmp},
6417 {TGSI_OPCODE_SCS, 0, ALU_OP0_NOP, tgsi_scs},
6418 {TGSI_OPCODE_TXB, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
6419 {TGSI_OPCODE_NRM, 0, ALU_OP0_NOP, tgsi_unsupported},
6420 {TGSI_OPCODE_DIV, 0, ALU_OP0_NOP, tgsi_unsupported},
6421 {TGSI_OPCODE_DP2, 0, ALU_OP2_DOT4, tgsi_dp},
6422 {TGSI_OPCODE_TXL, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
6423 {TGSI_OPCODE_BRK, 0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
6424 {TGSI_OPCODE_IF, 0, ALU_OP0_NOP, tgsi_if},
6425 {TGSI_OPCODE_UIF, 0, ALU_OP0_NOP, tgsi_uif},
6426 {76, 0, ALU_OP0_NOP, tgsi_unsupported},
6427 {TGSI_OPCODE_ELSE, 0, ALU_OP0_NOP, tgsi_else},
6428 {TGSI_OPCODE_ENDIF, 0, ALU_OP0_NOP, tgsi_endif},
6429 /* gap */
6430 {79, 0, ALU_OP0_NOP, tgsi_unsupported},
6431 {80, 0, ALU_OP0_NOP, tgsi_unsupported},
6432 {TGSI_OPCODE_PUSHA, 0, ALU_OP0_NOP, tgsi_unsupported},
6433 {TGSI_OPCODE_POPA, 0, ALU_OP0_NOP, tgsi_unsupported},
6434 {TGSI_OPCODE_CEIL, 0, ALU_OP1_CEIL, tgsi_op2},
6435 {TGSI_OPCODE_I2F, 0, ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
6436 {TGSI_OPCODE_NOT, 0, ALU_OP1_NOT_INT, tgsi_op2},
6437 {TGSI_OPCODE_TRUNC, 0, ALU_OP1_TRUNC, tgsi_op2},
6438 {TGSI_OPCODE_SHL, 0, ALU_OP2_LSHL_INT, tgsi_op2_trans},
6439 /* gap */
6440 {88, 0, ALU_OP0_NOP, tgsi_unsupported},
6441 {TGSI_OPCODE_AND, 0, ALU_OP2_AND_INT, tgsi_op2},
6442 {TGSI_OPCODE_OR, 0, ALU_OP2_OR_INT, tgsi_op2},
6443 {TGSI_OPCODE_MOD, 0, ALU_OP0_NOP, tgsi_imod},
6444 {TGSI_OPCODE_XOR, 0, ALU_OP2_XOR_INT, tgsi_op2},
6445 {TGSI_OPCODE_SAD, 0, ALU_OP0_NOP, tgsi_unsupported},
6446 {TGSI_OPCODE_TXF, 0, FETCH_OP_LD, tgsi_tex},
6447 {TGSI_OPCODE_TXQ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
6448 {TGSI_OPCODE_CONT, 0, CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
6449 {TGSI_OPCODE_EMIT, 0, CF_OP_EMIT_VERTEX, tgsi_gs_emit},
6450 {TGSI_OPCODE_ENDPRIM, 0, CF_OP_CUT_VERTEX, tgsi_gs_emit},
6451 {TGSI_OPCODE_BGNLOOP, 0, ALU_OP0_NOP, tgsi_bgnloop},
6452 {TGSI_OPCODE_BGNSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
6453 {TGSI_OPCODE_ENDLOOP, 0, ALU_OP0_NOP, tgsi_endloop},
6454 {TGSI_OPCODE_ENDSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
6455 {TGSI_OPCODE_TXQ_LZ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
6456 /* gap */
6457 {104, 0, ALU_OP0_NOP, tgsi_unsupported},
6458 {105, 0, ALU_OP0_NOP, tgsi_unsupported},
6459 {106, 0, ALU_OP0_NOP, tgsi_unsupported},
6460 {TGSI_OPCODE_NOP, 0, ALU_OP0_NOP, tgsi_unsupported},
6461 {TGSI_OPCODE_FSEQ, 0, ALU_OP2_SETE_DX10, tgsi_op2},
6462 {TGSI_OPCODE_FSGE, 0, ALU_OP2_SETGE_DX10, tgsi_op2},
6463 {TGSI_OPCODE_FSLT, 0, ALU_OP2_SETGT_DX10, tgsi_op2_swap},
6464 {TGSI_OPCODE_FSNE, 0, ALU_OP2_SETNE_DX10, tgsi_op2_swap},
6465 {TGSI_OPCODE_NRM4, 0, ALU_OP0_NOP, tgsi_unsupported},
6466 {TGSI_OPCODE_CALLNZ, 0, ALU_OP0_NOP, tgsi_unsupported},
6467 /* gap */
6468 {114, 0, ALU_OP0_NOP, tgsi_unsupported},
6469 {TGSI_OPCODE_BREAKC, 0, ALU_OP0_NOP, tgsi_unsupported},
6470 {TGSI_OPCODE_KILL_IF, 0, ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
6471 {TGSI_OPCODE_END, 0, ALU_OP0_NOP, tgsi_end}, /* aka HALT */
6472 /* gap */
6473 {118, 0, ALU_OP0_NOP, tgsi_unsupported},
6474 {TGSI_OPCODE_F2I, 0, ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
6475 {TGSI_OPCODE_IDIV, 0, ALU_OP0_NOP, tgsi_idiv},
6476 {TGSI_OPCODE_IMAX, 0, ALU_OP2_MAX_INT, tgsi_op2},
6477 {TGSI_OPCODE_IMIN, 0, ALU_OP2_MIN_INT, tgsi_op2},
6478 {TGSI_OPCODE_INEG, 0, ALU_OP2_SUB_INT, tgsi_ineg},
6479 {TGSI_OPCODE_ISGE, 0, ALU_OP2_SETGE_INT, tgsi_op2},
6480 {TGSI_OPCODE_ISHR, 0, ALU_OP2_ASHR_INT, tgsi_op2_trans},
6481 {TGSI_OPCODE_ISLT, 0, ALU_OP2_SETGT_INT, tgsi_op2_swap},
6482 {TGSI_OPCODE_F2U, 0, ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
6483 {TGSI_OPCODE_U2F, 0, ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
6484 {TGSI_OPCODE_UADD, 0, ALU_OP2_ADD_INT, tgsi_op2},
6485 {TGSI_OPCODE_UDIV, 0, ALU_OP0_NOP, tgsi_udiv},
6486 {TGSI_OPCODE_UMAD, 0, ALU_OP0_NOP, tgsi_umad},
6487 {TGSI_OPCODE_UMAX, 0, ALU_OP2_MAX_UINT, tgsi_op2},
6488 {TGSI_OPCODE_UMIN, 0, ALU_OP2_MIN_UINT, tgsi_op2},
6489 {TGSI_OPCODE_UMOD, 0, ALU_OP0_NOP, tgsi_umod},
6490 {TGSI_OPCODE_UMUL, 0, ALU_OP2_MULLO_UINT, tgsi_op2_trans},
6491 {TGSI_OPCODE_USEQ, 0, ALU_OP2_SETE_INT, tgsi_op2},
6492 {TGSI_OPCODE_USGE, 0, ALU_OP2_SETGE_UINT, tgsi_op2},
6493 {TGSI_OPCODE_USHR, 0, ALU_OP2_LSHR_INT, tgsi_op2_trans},
6494 {TGSI_OPCODE_USLT, 0, ALU_OP2_SETGT_UINT, tgsi_op2_swap},
6495 {TGSI_OPCODE_USNE, 0, ALU_OP2_SETNE_INT, tgsi_op2_swap},
6496 {TGSI_OPCODE_SWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
6497 {TGSI_OPCODE_CASE, 0, ALU_OP0_NOP, tgsi_unsupported},
6498 {TGSI_OPCODE_DEFAULT, 0, ALU_OP0_NOP, tgsi_unsupported},
6499 {TGSI_OPCODE_ENDSWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
6500 {TGSI_OPCODE_SAMPLE, 0, 0, tgsi_unsupported},
6501 {TGSI_OPCODE_SAMPLE_I, 0, 0, tgsi_unsupported},
6502 {TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
6503 {TGSI_OPCODE_SAMPLE_B, 0, 0, tgsi_unsupported},
6504 {TGSI_OPCODE_SAMPLE_C, 0, 0, tgsi_unsupported},
6505 {TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
6506 {TGSI_OPCODE_SAMPLE_D, 0, 0, tgsi_unsupported},
6507 {TGSI_OPCODE_SAMPLE_L, 0, 0, tgsi_unsupported},
6508 {TGSI_OPCODE_GATHER4, 0, 0, tgsi_unsupported},
6509 {TGSI_OPCODE_SVIEWINFO, 0, 0, tgsi_unsupported},
6510 {TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
6511 {TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
6512 {TGSI_OPCODE_UARL, 0, ALU_OP1_MOVA_INT, tgsi_r600_arl},
6513 {TGSI_OPCODE_UCMP, 0, ALU_OP0_NOP, tgsi_ucmp},
6514 {TGSI_OPCODE_IABS, 0, 0, tgsi_iabs},
6515 {TGSI_OPCODE_ISSG, 0, 0, tgsi_issg},
6516 {TGSI_OPCODE_LOAD, 0, ALU_OP0_NOP, tgsi_unsupported},
6517 {TGSI_OPCODE_STORE, 0, ALU_OP0_NOP, tgsi_unsupported},
6518 {TGSI_OPCODE_MFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
6519 {TGSI_OPCODE_LFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
6520 {TGSI_OPCODE_SFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
6521 {TGSI_OPCODE_BARRIER, 0, ALU_OP0_NOP, tgsi_unsupported},
6522 {TGSI_OPCODE_ATOMUADD, 0, ALU_OP0_NOP, tgsi_unsupported},
6523 {TGSI_OPCODE_ATOMXCHG, 0, ALU_OP0_NOP, tgsi_unsupported},
6524 {TGSI_OPCODE_ATOMCAS, 0, ALU_OP0_NOP, tgsi_unsupported},
6525 {TGSI_OPCODE_ATOMAND, 0, ALU_OP0_NOP, tgsi_unsupported},
6526 {TGSI_OPCODE_ATOMOR, 0, ALU_OP0_NOP, tgsi_unsupported},
6527 {TGSI_OPCODE_ATOMXOR, 0, ALU_OP0_NOP, tgsi_unsupported},
6528 {TGSI_OPCODE_ATOMUMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
6529 {TGSI_OPCODE_ATOMUMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
6530 {TGSI_OPCODE_ATOMIMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
6531 {TGSI_OPCODE_ATOMIMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
6532 {TGSI_OPCODE_TEX2, 0, FETCH_OP_SAMPLE, tgsi_tex},
6533 {TGSI_OPCODE_TXB2, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
6534 {TGSI_OPCODE_TXL2, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
6535 {TGSI_OPCODE_LAST, 0, ALU_OP0_NOP, tgsi_unsupported},
6536 };
6537
6538 static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
6539 {TGSI_OPCODE_ARL, 0, ALU_OP0_NOP, tgsi_eg_arl},
6540 {TGSI_OPCODE_MOV, 0, ALU_OP1_MOV, tgsi_op2},
6541 {TGSI_OPCODE_LIT, 0, ALU_OP0_NOP, tgsi_lit},
6542 {TGSI_OPCODE_RCP, 0, ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
6543 {TGSI_OPCODE_RSQ, 0, ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
6544 {TGSI_OPCODE_EXP, 0, ALU_OP0_NOP, tgsi_exp},
6545 {TGSI_OPCODE_LOG, 0, ALU_OP0_NOP, tgsi_log},
6546 {TGSI_OPCODE_MUL, 0, ALU_OP2_MUL, tgsi_op2},
6547 {TGSI_OPCODE_ADD, 0, ALU_OP2_ADD, tgsi_op2},
6548 {TGSI_OPCODE_DP3, 0, ALU_OP2_DOT4, tgsi_dp},
6549 {TGSI_OPCODE_DP4, 0, ALU_OP2_DOT4, tgsi_dp},
6550 {TGSI_OPCODE_DST, 0, ALU_OP0_NOP, tgsi_opdst},
6551 {TGSI_OPCODE_MIN, 0, ALU_OP2_MIN, tgsi_op2},
6552 {TGSI_OPCODE_MAX, 0, ALU_OP2_MAX, tgsi_op2},
6553 {TGSI_OPCODE_SLT, 0, ALU_OP2_SETGT, tgsi_op2_swap},
6554 {TGSI_OPCODE_SGE, 0, ALU_OP2_SETGE, tgsi_op2},
6555 {TGSI_OPCODE_MAD, 1, ALU_OP3_MULADD, tgsi_op3},
6556 {TGSI_OPCODE_SUB, 0, ALU_OP2_ADD, tgsi_op2},
6557 {TGSI_OPCODE_LRP, 0, ALU_OP0_NOP, tgsi_lrp},
6558 {TGSI_OPCODE_CND, 0, ALU_OP0_NOP, tgsi_unsupported},
6559 /* gap */
6560 {20, 0, ALU_OP0_NOP, tgsi_unsupported},
6561 {TGSI_OPCODE_DP2A, 0, ALU_OP0_NOP, tgsi_unsupported},
6562 /* gap */
6563 {22, 0, ALU_OP0_NOP, tgsi_unsupported},
6564 {23, 0, ALU_OP0_NOP, tgsi_unsupported},
6565 {TGSI_OPCODE_FRC, 0, ALU_OP1_FRACT, tgsi_op2},
6566 {TGSI_OPCODE_CLAMP, 0, ALU_OP0_NOP, tgsi_unsupported},
6567 {TGSI_OPCODE_FLR, 0, ALU_OP1_FLOOR, tgsi_op2},
6568 {TGSI_OPCODE_ROUND, 0, ALU_OP1_RNDNE, tgsi_op2},
6569 {TGSI_OPCODE_EX2, 0, ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
6570 {TGSI_OPCODE_LG2, 0, ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
6571 {TGSI_OPCODE_POW, 0, ALU_OP0_NOP, tgsi_pow},
6572 {TGSI_OPCODE_XPD, 0, ALU_OP0_NOP, tgsi_xpd},
6573 /* gap */
6574 {32, 0, ALU_OP0_NOP, tgsi_unsupported},
6575 {TGSI_OPCODE_ABS, 0, ALU_OP1_MOV, tgsi_op2},
6576 {TGSI_OPCODE_RCC, 0, ALU_OP0_NOP, tgsi_unsupported},
6577 {TGSI_OPCODE_DPH, 0, ALU_OP2_DOT4, tgsi_dp},
6578 {TGSI_OPCODE_COS, 0, ALU_OP1_COS, tgsi_trig},
6579 {TGSI_OPCODE_DDX, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
6580 {TGSI_OPCODE_DDY, 0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
6581 {TGSI_OPCODE_KILL, 0, ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
6582 {TGSI_OPCODE_PK2H, 0, ALU_OP0_NOP, tgsi_unsupported},
6583 {TGSI_OPCODE_PK2US, 0, ALU_OP0_NOP, tgsi_unsupported},
6584 {TGSI_OPCODE_PK4B, 0, ALU_OP0_NOP, tgsi_unsupported},
6585 {TGSI_OPCODE_PK4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
6586 {TGSI_OPCODE_RFL, 0, ALU_OP0_NOP, tgsi_unsupported},
6587 {TGSI_OPCODE_SEQ, 0, ALU_OP2_SETE, tgsi_op2},
6588 {TGSI_OPCODE_SFL, 0, ALU_OP0_NOP, tgsi_unsupported},
6589 {TGSI_OPCODE_SGT, 0, ALU_OP2_SETGT, tgsi_op2},
6590 {TGSI_OPCODE_SIN, 0, ALU_OP1_SIN, tgsi_trig},
6591 {TGSI_OPCODE_SLE, 0, ALU_OP2_SETGE, tgsi_op2_swap},
6592 {TGSI_OPCODE_SNE, 0, ALU_OP2_SETNE, tgsi_op2},
6593 {TGSI_OPCODE_STR, 0, ALU_OP0_NOP, tgsi_unsupported},
6594 {TGSI_OPCODE_TEX, 0, FETCH_OP_SAMPLE, tgsi_tex},
6595 {TGSI_OPCODE_TXD, 0, FETCH_OP_SAMPLE_G, tgsi_tex},
6596 {TGSI_OPCODE_TXP, 0, FETCH_OP_SAMPLE, tgsi_tex},
6597 {TGSI_OPCODE_UP2H, 0, ALU_OP0_NOP, tgsi_unsupported},
6598 {TGSI_OPCODE_UP2US, 0, ALU_OP0_NOP, tgsi_unsupported},
6599 {TGSI_OPCODE_UP4B, 0, ALU_OP0_NOP, tgsi_unsupported},
6600 {TGSI_OPCODE_UP4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
6601 {TGSI_OPCODE_X2D, 0, ALU_OP0_NOP, tgsi_unsupported},
6602 {TGSI_OPCODE_ARA, 0, ALU_OP0_NOP, tgsi_unsupported},
6603 {TGSI_OPCODE_ARR, 0, ALU_OP0_NOP, tgsi_eg_arl},
6604 {TGSI_OPCODE_BRA, 0, ALU_OP0_NOP, tgsi_unsupported},
6605 {TGSI_OPCODE_CAL, 0, ALU_OP0_NOP, tgsi_unsupported},
6606 {TGSI_OPCODE_RET, 0, ALU_OP0_NOP, tgsi_unsupported},
6607 {TGSI_OPCODE_SSG, 0, ALU_OP0_NOP, tgsi_ssg},
6608 {TGSI_OPCODE_CMP, 0, ALU_OP0_NOP, tgsi_cmp},
6609 {TGSI_OPCODE_SCS, 0, ALU_OP0_NOP, tgsi_scs},
6610 {TGSI_OPCODE_TXB, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
6611 {TGSI_OPCODE_NRM, 0, ALU_OP0_NOP, tgsi_unsupported},
6612 {TGSI_OPCODE_DIV, 0, ALU_OP0_NOP, tgsi_unsupported},
6613 {TGSI_OPCODE_DP2, 0, ALU_OP2_DOT4, tgsi_dp},
6614 {TGSI_OPCODE_TXL, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
6615 {TGSI_OPCODE_BRK, 0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
6616 {TGSI_OPCODE_IF, 0, ALU_OP0_NOP, tgsi_if},
6617 {TGSI_OPCODE_UIF, 0, ALU_OP0_NOP, tgsi_uif},
6618 {76, 0, ALU_OP0_NOP, tgsi_unsupported},
6619 {TGSI_OPCODE_ELSE, 0, ALU_OP0_NOP, tgsi_else},
6620 {TGSI_OPCODE_ENDIF, 0, ALU_OP0_NOP, tgsi_endif},
6621 /* gap */
6622 {79, 0, ALU_OP0_NOP, tgsi_unsupported},
6623 {80, 0, ALU_OP0_NOP, tgsi_unsupported},
6624 {TGSI_OPCODE_PUSHA, 0, ALU_OP0_NOP, tgsi_unsupported},
6625 {TGSI_OPCODE_POPA, 0, ALU_OP0_NOP, tgsi_unsupported},
6626 {TGSI_OPCODE_CEIL, 0, ALU_OP1_CEIL, tgsi_op2},
6627 {TGSI_OPCODE_I2F, 0, ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
6628 {TGSI_OPCODE_NOT, 0, ALU_OP1_NOT_INT, tgsi_op2},
6629 {TGSI_OPCODE_TRUNC, 0, ALU_OP1_TRUNC, tgsi_op2},
6630 {TGSI_OPCODE_SHL, 0, ALU_OP2_LSHL_INT, tgsi_op2},
6631 /* gap */
6632 {88, 0, ALU_OP0_NOP, tgsi_unsupported},
6633 {TGSI_OPCODE_AND, 0, ALU_OP2_AND_INT, tgsi_op2},
6634 {TGSI_OPCODE_OR, 0, ALU_OP2_OR_INT, tgsi_op2},
6635 {TGSI_OPCODE_MOD, 0, ALU_OP0_NOP, tgsi_imod},
6636 {TGSI_OPCODE_XOR, 0, ALU_OP2_XOR_INT, tgsi_op2},
6637 {TGSI_OPCODE_SAD, 0, ALU_OP0_NOP, tgsi_unsupported},
6638 {TGSI_OPCODE_TXF, 0, FETCH_OP_LD, tgsi_tex},
6639 {TGSI_OPCODE_TXQ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
6640 {TGSI_OPCODE_CONT, 0, CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
6641 {TGSI_OPCODE_EMIT, 0, CF_OP_EMIT_VERTEX, tgsi_gs_emit},
6642 {TGSI_OPCODE_ENDPRIM, 0, CF_OP_CUT_VERTEX, tgsi_gs_emit},
6643 {TGSI_OPCODE_BGNLOOP, 0, ALU_OP0_NOP, tgsi_bgnloop},
6644 {TGSI_OPCODE_BGNSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
6645 {TGSI_OPCODE_ENDLOOP, 0, ALU_OP0_NOP, tgsi_endloop},
6646 {TGSI_OPCODE_ENDSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
6647 {TGSI_OPCODE_TXQ_LZ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
6648 /* gap */
6649 {104, 0, ALU_OP0_NOP, tgsi_unsupported},
6650 {105, 0, ALU_OP0_NOP, tgsi_unsupported},
6651 {106, 0, ALU_OP0_NOP, tgsi_unsupported},
6652 {TGSI_OPCODE_NOP, 0, ALU_OP0_NOP, tgsi_unsupported},
6653 {TGSI_OPCODE_FSEQ, 0, ALU_OP2_SETE_DX10, tgsi_op2},
6654 {TGSI_OPCODE_FSGE, 0, ALU_OP2_SETGE_DX10, tgsi_op2},
6655 {TGSI_OPCODE_FSLT, 0, ALU_OP2_SETGT_DX10, tgsi_op2_swap},
6656 {TGSI_OPCODE_FSNE, 0, ALU_OP2_SETNE_DX10, tgsi_op2_swap},
6657 {TGSI_OPCODE_NRM4, 0, ALU_OP0_NOP, tgsi_unsupported},
6658 {TGSI_OPCODE_CALLNZ, 0, ALU_OP0_NOP, tgsi_unsupported},
6659 /* gap */
6660 {114, 0, ALU_OP0_NOP, tgsi_unsupported},
6661 {TGSI_OPCODE_BREAKC, 0, ALU_OP0_NOP, tgsi_unsupported},
6662 {TGSI_OPCODE_KILL_IF, 0, ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
6663 {TGSI_OPCODE_END, 0, ALU_OP0_NOP, tgsi_end}, /* aka HALT */
6664 /* gap */
6665 {118, 0, ALU_OP0_NOP, tgsi_unsupported},
6666 {TGSI_OPCODE_F2I, 0, ALU_OP1_FLT_TO_INT, tgsi_f2i},
6667 {TGSI_OPCODE_IDIV, 0, ALU_OP0_NOP, tgsi_idiv},
6668 {TGSI_OPCODE_IMAX, 0, ALU_OP2_MAX_INT, tgsi_op2},
6669 {TGSI_OPCODE_IMIN, 0, ALU_OP2_MIN_INT, tgsi_op2},
6670 {TGSI_OPCODE_INEG, 0, ALU_OP2_SUB_INT, tgsi_ineg},
6671 {TGSI_OPCODE_ISGE, 0, ALU_OP2_SETGE_INT, tgsi_op2},
6672 {TGSI_OPCODE_ISHR, 0, ALU_OP2_ASHR_INT, tgsi_op2},
6673 {TGSI_OPCODE_ISLT, 0, ALU_OP2_SETGT_INT, tgsi_op2_swap},
6674 {TGSI_OPCODE_F2U, 0, ALU_OP1_FLT_TO_UINT, tgsi_f2i},
6675 {TGSI_OPCODE_U2F, 0, ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
6676 {TGSI_OPCODE_UADD, 0, ALU_OP2_ADD_INT, tgsi_op2},
6677 {TGSI_OPCODE_UDIV, 0, ALU_OP0_NOP, tgsi_udiv},
6678 {TGSI_OPCODE_UMAD, 0, ALU_OP0_NOP, tgsi_umad},
6679 {TGSI_OPCODE_UMAX, 0, ALU_OP2_MAX_UINT, tgsi_op2},
6680 {TGSI_OPCODE_UMIN, 0, ALU_OP2_MIN_UINT, tgsi_op2},
6681 {TGSI_OPCODE_UMOD, 0, ALU_OP0_NOP, tgsi_umod},
6682 {TGSI_OPCODE_UMUL, 0, ALU_OP2_MULLO_UINT, tgsi_op2_trans},
6683 {TGSI_OPCODE_USEQ, 0, ALU_OP2_SETE_INT, tgsi_op2},
6684 {TGSI_OPCODE_USGE, 0, ALU_OP2_SETGE_UINT, tgsi_op2},
6685 {TGSI_OPCODE_USHR, 0, ALU_OP2_LSHR_INT, tgsi_op2},
6686 {TGSI_OPCODE_USLT, 0, ALU_OP2_SETGT_UINT, tgsi_op2_swap},
6687 {TGSI_OPCODE_USNE, 0, ALU_OP2_SETNE_INT, tgsi_op2},
6688 {TGSI_OPCODE_SWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
6689 {TGSI_OPCODE_CASE, 0, ALU_OP0_NOP, tgsi_unsupported},
6690 {TGSI_OPCODE_DEFAULT, 0, ALU_OP0_NOP, tgsi_unsupported},
6691 {TGSI_OPCODE_ENDSWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
6692 {TGSI_OPCODE_SAMPLE, 0, 0, tgsi_unsupported},
6693 {TGSI_OPCODE_SAMPLE_I, 0, 0, tgsi_unsupported},
6694 {TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
6695 {TGSI_OPCODE_SAMPLE_B, 0, 0, tgsi_unsupported},
6696 {TGSI_OPCODE_SAMPLE_C, 0, 0, tgsi_unsupported},
6697 {TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
6698 {TGSI_OPCODE_SAMPLE_D, 0, 0, tgsi_unsupported},
6699 {TGSI_OPCODE_SAMPLE_L, 0, 0, tgsi_unsupported},
6700 {TGSI_OPCODE_GATHER4, 0, 0, tgsi_unsupported},
6701 {TGSI_OPCODE_SVIEWINFO, 0, 0, tgsi_unsupported},
6702 {TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
6703 {TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
6704 {TGSI_OPCODE_UARL, 0, ALU_OP1_MOVA_INT, tgsi_eg_arl},
6705 {TGSI_OPCODE_UCMP, 0, ALU_OP0_NOP, tgsi_ucmp},
6706 {TGSI_OPCODE_IABS, 0, 0, tgsi_iabs},
6707 {TGSI_OPCODE_ISSG, 0, 0, tgsi_issg},
6708 {TGSI_OPCODE_LOAD, 0, ALU_OP0_NOP, tgsi_unsupported},
6709 {TGSI_OPCODE_STORE, 0, ALU_OP0_NOP, tgsi_unsupported},
6710 {TGSI_OPCODE_MFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
6711 {TGSI_OPCODE_LFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
6712 {TGSI_OPCODE_SFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
6713 {TGSI_OPCODE_BARRIER, 0, ALU_OP0_NOP, tgsi_unsupported},
6714 {TGSI_OPCODE_ATOMUADD, 0, ALU_OP0_NOP, tgsi_unsupported},
6715 {TGSI_OPCODE_ATOMXCHG, 0, ALU_OP0_NOP, tgsi_unsupported},
6716 {TGSI_OPCODE_ATOMCAS, 0, ALU_OP0_NOP, tgsi_unsupported},
6717 {TGSI_OPCODE_ATOMAND, 0, ALU_OP0_NOP, tgsi_unsupported},
6718 {TGSI_OPCODE_ATOMOR, 0, ALU_OP0_NOP, tgsi_unsupported},
6719 {TGSI_OPCODE_ATOMXOR, 0, ALU_OP0_NOP, tgsi_unsupported},
6720 {TGSI_OPCODE_ATOMUMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
6721 {TGSI_OPCODE_ATOMUMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
6722 {TGSI_OPCODE_ATOMIMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
6723 {TGSI_OPCODE_ATOMIMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
6724 {TGSI_OPCODE_TEX2, 0, FETCH_OP_SAMPLE, tgsi_tex},
6725 {TGSI_OPCODE_TXB2, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
6726 {TGSI_OPCODE_TXL2, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
6727 {TGSI_OPCODE_LAST, 0, ALU_OP0_NOP, tgsi_unsupported},
6728 };
6729
/*
 * TGSI -> hardware instruction table for the Cayman (CM) family.
 *
 * Same layout and positional-indexing rules as the EG table above: the
 * table is indexed by TGSI opcode number, and bare-number placeholder
 * entries under the "gap" comments fill unused opcode slots — do not
 * reorder or remove entries.
 *
 * Cayman lacks the dedicated t-slot, so scalar transcendental ops
 * (RCP, RSQ, EX2, LG2, POW, SIN, COS) and 32-bit integer multiplies
 * use cayman_* callbacks that replicate the instruction across vector
 * slots (see the CAYMAN notes at the top of this file).
 */
static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
	{TGSI_OPCODE_ARL,	0, ALU_OP0_NOP, tgsi_eg_arl},
	{TGSI_OPCODE_MOV,	0, ALU_OP1_MOV, tgsi_op2},
	{TGSI_OPCODE_LIT,	0, ALU_OP0_NOP, tgsi_lit},
	{TGSI_OPCODE_RCP,	0, ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
	{TGSI_OPCODE_RSQ,	0, ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
	{TGSI_OPCODE_EXP,	0, ALU_OP0_NOP, tgsi_exp},
	{TGSI_OPCODE_LOG,	0, ALU_OP0_NOP, tgsi_log},
	{TGSI_OPCODE_MUL,	0, ALU_OP2_MUL, tgsi_op2},
	{TGSI_OPCODE_ADD,	0, ALU_OP2_ADD, tgsi_op2},
	{TGSI_OPCODE_DP3,	0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_DP4,	0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_DST,	0, ALU_OP0_NOP, tgsi_opdst},
	{TGSI_OPCODE_MIN,	0, ALU_OP2_MIN, tgsi_op2},
	{TGSI_OPCODE_MAX,	0, ALU_OP2_MAX, tgsi_op2},
	{TGSI_OPCODE_SLT,	0, ALU_OP2_SETGT, tgsi_op2_swap},
	{TGSI_OPCODE_SGE,	0, ALU_OP2_SETGE, tgsi_op2},
	/* MAD is the only 3-source entry (second field == 1). */
	{TGSI_OPCODE_MAD,	1, ALU_OP3_MULADD, tgsi_op3},
	/* SUB is emitted as an ADD; the callback negates the second source. */
	{TGSI_OPCODE_SUB,	0, ALU_OP2_ADD, tgsi_op2},
	{TGSI_OPCODE_LRP,	0, ALU_OP0_NOP, tgsi_lrp},
	{TGSI_OPCODE_CND,	0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{20,			0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DP2A,	0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{22,			0, ALU_OP0_NOP, tgsi_unsupported},
	{23,			0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_FRC,	0, ALU_OP1_FRACT, tgsi_op2},
	{TGSI_OPCODE_CLAMP,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_FLR,	0, ALU_OP1_FLOOR, tgsi_op2},
	{TGSI_OPCODE_ROUND,	0, ALU_OP1_RNDNE, tgsi_op2},
	{TGSI_OPCODE_EX2,	0, ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
	{TGSI_OPCODE_LG2,	0, ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
	{TGSI_OPCODE_POW,	0, ALU_OP0_NOP, cayman_pow},
	{TGSI_OPCODE_XPD,	0, ALU_OP0_NOP, tgsi_xpd},
	/* gap */
	{32,			0, ALU_OP0_NOP, tgsi_unsupported},
	/* ABS is a MOV; the callback applies the source-absolute modifier. */
	{TGSI_OPCODE_ABS,	0, ALU_OP1_MOV, tgsi_op2},
	{TGSI_OPCODE_RCC,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DPH,	0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_COS,	0, ALU_OP1_COS, cayman_trig},
	{TGSI_OPCODE_DDX,	0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	{TGSI_OPCODE_DDY,	0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	{TGSI_OPCODE_KILL,	0, ALU_OP2_KILLGT, tgsi_kill},	/* unconditional kill */
	{TGSI_OPCODE_PK2H,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK2US,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK4B,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PK4UB,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_RFL,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SEQ,	0, ALU_OP2_SETE, tgsi_op2},
	{TGSI_OPCODE_SFL,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SGT,	0, ALU_OP2_SETGT, tgsi_op2},
	{TGSI_OPCODE_SIN,	0, ALU_OP1_SIN, cayman_trig},
	/* SLE/SLT have no direct hw op: emit SETGE/SETGT with sources swapped. */
	{TGSI_OPCODE_SLE,	0, ALU_OP2_SETGE, tgsi_op2_swap},
	{TGSI_OPCODE_SNE,	0, ALU_OP2_SETNE, tgsi_op2},
	{TGSI_OPCODE_STR,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TEX,	0, FETCH_OP_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_TXD,	0, FETCH_OP_SAMPLE_G, tgsi_tex},
	{TGSI_OPCODE_TXP,	0, FETCH_OP_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_UP2H,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP2US,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP4B,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_UP4UB,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_X2D,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ARA,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ARR,	0, ALU_OP0_NOP, tgsi_eg_arl},
	{TGSI_OPCODE_BRA,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CAL,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_RET,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SSG,	0, ALU_OP0_NOP, tgsi_ssg},
	{TGSI_OPCODE_CMP,	0, ALU_OP0_NOP, tgsi_cmp},
	{TGSI_OPCODE_SCS,	0, ALU_OP0_NOP, tgsi_scs},
	{TGSI_OPCODE_TXB,	0, FETCH_OP_SAMPLE_LB, tgsi_tex},
	{TGSI_OPCODE_NRM,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DIV,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DP2,	0, ALU_OP2_DOT4, tgsi_dp},
	{TGSI_OPCODE_TXL,	0, FETCH_OP_SAMPLE_L, tgsi_tex},
	{TGSI_OPCODE_BRK,	0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	{TGSI_OPCODE_IF,	0, ALU_OP0_NOP, tgsi_if},
	{TGSI_OPCODE_UIF,	0, ALU_OP0_NOP, tgsi_uif},
	{76,			0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ELSE,	0, ALU_OP0_NOP, tgsi_else},
	{TGSI_OPCODE_ENDIF,	0, ALU_OP0_NOP, tgsi_endif},
	/* gap */
	{79,			0, ALU_OP0_NOP, tgsi_unsupported},
	{80,			0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_PUSHA,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_POPA,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CEIL,	0, ALU_OP1_CEIL, tgsi_op2},
	{TGSI_OPCODE_I2F,	0, ALU_OP1_INT_TO_FLT, tgsi_op2},
	{TGSI_OPCODE_NOT,	0, ALU_OP1_NOT_INT, tgsi_op2},
	{TGSI_OPCODE_TRUNC,	0, ALU_OP1_TRUNC, tgsi_op2},
	{TGSI_OPCODE_SHL,	0, ALU_OP2_LSHL_INT, tgsi_op2},
	/* gap */
	{88,			0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_AND,	0, ALU_OP2_AND_INT, tgsi_op2},
	{TGSI_OPCODE_OR,	0, ALU_OP2_OR_INT, tgsi_op2},
	{TGSI_OPCODE_MOD,	0, ALU_OP0_NOP, tgsi_imod},
	{TGSI_OPCODE_XOR,	0, ALU_OP2_XOR_INT, tgsi_op2},
	{TGSI_OPCODE_SAD,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TXF,	0, FETCH_OP_LD, tgsi_tex},
	{TGSI_OPCODE_TXQ,	0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	{TGSI_OPCODE_CONT,	0, CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	{TGSI_OPCODE_EMIT,	0, CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	{TGSI_OPCODE_ENDPRIM,	0, CF_OP_CUT_VERTEX, tgsi_gs_emit},
	{TGSI_OPCODE_BGNLOOP,	0, ALU_OP0_NOP, tgsi_bgnloop},
	{TGSI_OPCODE_BGNSUB,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ENDLOOP,	0, ALU_OP0_NOP, tgsi_endloop},
	{TGSI_OPCODE_ENDSUB,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TXQ_LZ,	0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	/* gap */
	{104,			0, ALU_OP0_NOP, tgsi_unsupported},
	{105,			0, ALU_OP0_NOP, tgsi_unsupported},
	{106,			0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_NOP,	0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{TGSI_OPCODE_FSEQ,	0, ALU_OP2_SETE_DX10, tgsi_op2},
	{TGSI_OPCODE_FSGE,	0, ALU_OP2_SETGE_DX10, tgsi_op2},
	{TGSI_OPCODE_FSLT,	0, ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	{TGSI_OPCODE_FSNE,	0, ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	{TGSI_OPCODE_NRM4,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CALLNZ,	0, ALU_OP0_NOP, tgsi_unsupported},
	/* gap */
	{114,			0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_BREAKC,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_KILL_IF,	0, ALU_OP2_KILLGT, tgsi_kill},	/* conditional kill */
	{TGSI_OPCODE_END,	0, ALU_OP0_NOP, tgsi_end},	/* aka HALT */
	/* gap */
	{118,			0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_F2I,	0, ALU_OP1_FLT_TO_INT, tgsi_op2},
	{TGSI_OPCODE_IDIV,	0, ALU_OP0_NOP, tgsi_idiv},
	{TGSI_OPCODE_IMAX,	0, ALU_OP2_MAX_INT, tgsi_op2},
	{TGSI_OPCODE_IMIN,	0, ALU_OP2_MIN_INT, tgsi_op2},
	{TGSI_OPCODE_INEG,	0, ALU_OP2_SUB_INT, tgsi_ineg},
	{TGSI_OPCODE_ISGE,	0, ALU_OP2_SETGE_INT, tgsi_op2},
	{TGSI_OPCODE_ISHR,	0, ALU_OP2_ASHR_INT, tgsi_op2},
	{TGSI_OPCODE_ISLT,	0, ALU_OP2_SETGT_INT, tgsi_op2_swap},
	{TGSI_OPCODE_F2U,	0, ALU_OP1_FLT_TO_UINT, tgsi_op2},
	{TGSI_OPCODE_U2F,	0, ALU_OP1_UINT_TO_FLT, tgsi_op2},
	{TGSI_OPCODE_UADD,	0, ALU_OP2_ADD_INT, tgsi_op2},
	{TGSI_OPCODE_UDIV,	0, ALU_OP0_NOP, tgsi_udiv},
	{TGSI_OPCODE_UMAD,	0, ALU_OP0_NOP, tgsi_umad},
	{TGSI_OPCODE_UMAX,	0, ALU_OP2_MAX_UINT, tgsi_op2},
	{TGSI_OPCODE_UMIN,	0, ALU_OP2_MIN_UINT, tgsi_op2},
	{TGSI_OPCODE_UMOD,	0, ALU_OP0_NOP, tgsi_umod},
	/* Integer multiply needs vector-slot replication on Cayman. */
	{TGSI_OPCODE_UMUL,	0, ALU_OP2_MULLO_INT, cayman_mul_int_instr},
	{TGSI_OPCODE_USEQ,	0, ALU_OP2_SETE_INT, tgsi_op2},
	{TGSI_OPCODE_USGE,	0, ALU_OP2_SETGE_UINT, tgsi_op2},
	{TGSI_OPCODE_USHR,	0, ALU_OP2_LSHR_INT, tgsi_op2},
	{TGSI_OPCODE_USLT,	0, ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	{TGSI_OPCODE_USNE,	0, ALU_OP2_SETNE_INT, tgsi_op2},
	{TGSI_OPCODE_SWITCH,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_CASE,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_DEFAULT,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ENDSWITCH,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE,	0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_I,	0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_I_MS,	0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_B,	0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_C,	0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_C_LZ,	0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_D,	0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_L,	0, 0, tgsi_unsupported},
	{TGSI_OPCODE_GATHER4,	0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SVIEWINFO,	0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_POS,	0, 0, tgsi_unsupported},
	{TGSI_OPCODE_SAMPLE_INFO,	0, 0, tgsi_unsupported},
	{TGSI_OPCODE_UARL,	0, ALU_OP1_MOVA_INT, tgsi_eg_arl},
	{TGSI_OPCODE_UCMP,	0, ALU_OP0_NOP, tgsi_ucmp},
	{TGSI_OPCODE_IABS,	0, 0, tgsi_iabs},
	{TGSI_OPCODE_ISSG,	0, 0, tgsi_issg},
	{TGSI_OPCODE_LOAD,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_STORE,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_MFENCE,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_LFENCE,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_SFENCE,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_BARRIER,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMUADD,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMXCHG,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMCAS,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMAND,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMOR,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMXOR,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMUMIN,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMUMAX,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMIMIN,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_ATOMIMAX,	0, ALU_OP0_NOP, tgsi_unsupported},
	{TGSI_OPCODE_TEX2,	0, FETCH_OP_SAMPLE, tgsi_tex},
	{TGSI_OPCODE_TXB2,	0, FETCH_OP_SAMPLE_LB, tgsi_tex},
	{TGSI_OPCODE_TXL2,	0, FETCH_OP_SAMPLE_L, tgsi_tex},
	{TGSI_OPCODE_LAST,	0, ALU_OP0_NOP, tgsi_unsupported},
};