r600g: allow vs to write to gl_ViewportIndex
[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include "r600_sq.h"
24 #include "r600_llvm.h"
25 #include "r600_formats.h"
26 #include "r600_opcodes.h"
27 #include "r600_shader.h"
28 #include "r600d.h"
29
30 #include "sb/sb_public.h"
31
32 #include "pipe/p_shader_tokens.h"
33 #include "tgsi/tgsi_info.h"
34 #include "tgsi/tgsi_parse.h"
35 #include "tgsi/tgsi_scan.h"
36 #include "tgsi/tgsi_dump.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include <stdio.h>
40 #include <errno.h>
41
42 /* CAYMAN notes
43 Why CAYMAN got loops for lots of instructions is explained here.
44
45 -These 8xx t-slot only ops are implemented in all vector slots.
46 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47 These 8xx t-slot only opcodes become vector ops, with all four
48 slots expecting the arguments on sources a and b. Result is
49 broadcast to all channels.
50 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
51 These 8xx t-slot only opcodes become vector ops in the z, y, and
52 x slots.
53 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55 SQRT_IEEE/_64
56 SIN/COS
57 The w slot may have an independent co-issued operation, or if the
58 result is required to be in the w slot, the opcode above may be
59 issued in the w slot as well.
60 The compiler must issue the source argument to slots z, y, and x
61 */
62
63 static int r600_shader_from_tgsi(struct r600_context *rctx,
64 struct r600_pipe_shader *pipeshader,
65 struct r600_shader_key key);
66
67 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
68 int size, unsigned comp_mask) {
69
70 if (!size)
71 return;
72
73 if (ps->num_arrays == ps->max_arrays) {
74 ps->max_arrays += 64;
75 ps->arrays = realloc(ps->arrays, ps->max_arrays *
76 sizeof(struct r600_shader_array));
77 }
78
79 int n = ps->num_arrays;
80 ++ps->num_arrays;
81
82 ps->arrays[n].comp_mask = comp_mask;
83 ps->arrays[n].gpr_start = start_gpr;
84 ps->arrays[n].gpr_count = size;
85 }
86
87 static void r600_dump_streamout(struct pipe_stream_output_info *so)
88 {
89 unsigned i;
90
91 fprintf(stderr, "STREAMOUT\n");
92 for (i = 0; i < so->num_outputs; i++) {
93 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
94 so->output[i].start_component;
95 fprintf(stderr, " %i: MEM_STREAM0_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
96 i, so->output[i].output_buffer,
97 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
98 so->output[i].register_index,
99 mask & 1 ? "x" : "",
100 mask & 2 ? "y" : "",
101 mask & 4 ? "z" : "",
102 mask & 8 ? "w" : "",
103 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
104 }
105 }
106
107 static int store_shader(struct pipe_context *ctx,
108 struct r600_pipe_shader *shader)
109 {
110 struct r600_context *rctx = (struct r600_context *)ctx;
111 uint32_t *ptr, i;
112
113 if (shader->bo == NULL) {
114 shader->bo = (struct r600_resource*)
115 pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
116 if (shader->bo == NULL) {
117 return -ENOMEM;
118 }
119 ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
120 if (R600_BIG_ENDIAN) {
121 for (i = 0; i < shader->shader.bc.ndw; ++i) {
122 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
123 }
124 } else {
125 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
126 }
127 rctx->b.ws->buffer_unmap(shader->bo->cs_buf);
128 }
129
130 return 0;
131 }
132
/* Compile a TGSI shader into r600 bytecode, upload it, and build the
 * chip-specific shader state.
 *
 * Pipeline: TGSI translation -> (optional) bytecode build -> optional SB
 * optimizer / disassembler (selected by the DBG_NO_SB / DBG_SB_DISASM
 * debug flags) -> upload via store_shader() -> per-chip VS/GS/ES/PS state
 * update.  On any failure the partially-built shader is destroyed and the
 * error code is returned. */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    struct r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	/* a VS compiled as an ES feeds the ESGS ring instead of exporting */
	unsigned export_shader = key.vs_as_es;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}

	/* disable SB for geom shaders - it can't handle the CF_EMIT instructions */
	use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY);

	/* Check if the bytecode has already been built.  When using the llvm
	 * backend, r600_shader_from_tgsi() will take care of building the
	 * bytecode.
	 */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		/* SB pass does the optimization and/or the disassembly */
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
					     dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case TGSI_PROCESSOR_GEOMETRY:
		/* a GS always comes with a VS copy shader for the fixed-function
		 * vertex path */
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case TGSI_PROCESSOR_VERTEX:
		if (rctx->b.chip_class >= EVERGREEN) {
			if (export_shader)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case TGSI_PROCESSOR_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
247
/* Release everything a compiled shader owns: the GPU buffer object, the
 * bytecode storage, and the prebuilt command buffer.  Safe to call on a
 * partially-constructed shader (used on the error path of
 * r600_pipe_shader_create). */
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
{
	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
254
255 /*
256 * tgsi -> r600 shader
257 */
258 struct r600_shader_tgsi_instruction;
259
/* One TGSI source operand lowered to r600 bytecode terms (filled in by
 * tgsi_src and consumed when ALU instructions are emitted). */
struct r600_shader_src {
	unsigned sel;        /* register selector; V_SQ_ALU_SRC_LITERAL for immediates */
	unsigned swizzle[4]; /* per-component swizzle */
	unsigned neg;        /* negate modifier */
	unsigned abs;        /* absolute-value modifier */
	unsigned rel;        /* non-zero (V_SQ_REL_RELATIVE) for AR-indexed addressing */
	unsigned kc_bank;    /* constant buffer index, from a 2D CONST reference */
	uint32_t value[4];   /* literal words when sel == V_SQ_ALU_SRC_LITERAL */
};
269
/* Transient state for one TGSI -> r600 bytecode translation run. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;       /* TGSI_PROCESSOR_* being compiled */
	unsigned				file_offset[TGSI_FILE_COUNT]; /* GPR base offset added per TGSI register file (see tgsi_src) */
	unsigned				temp_reg;   /* first driver-reserved temp GPR (see r600_get_temp) */
	struct r600_shader_tgsi_instruction	*inst_info; /* dispatch-table entry for the current instruction */
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];     /* lowered sources of the current instruction */
	uint32_t				*literals;  /* immediate pool, 4 dwords per TGSI immediate */
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used; /* temps handed out so far for this instruction */
	boolean					use_llvm;
	/* needed for evergreen interpolation */
	boolean					input_centroid;    /* any centroid-interpolated FS input */
	boolean					input_linear;      /* any linearly-interpolated FS input */
	boolean					input_perspective; /* any perspective-interpolated FS input */
	int					num_interp_gpr;    /* GPRs reserved for barycentric ij pairs */
	int					face_gpr;          /* GPR holding the FACE input */
	int					colors_used;       /* number of COLOR FS inputs seen */
	boolean					clip_vertex_write; /* shader writes CLIPVERTEX */
	unsigned				cv_output;         /* output slot of CLIPVERTEX */
	unsigned				edgeflag_output;   /* output slot of EDGEFLAG */
	int					fragcoord_input;   /* input slot of FS POSITION */
	int					native_integers;
	int					next_ring_offset;   /* running ESGS ring offset for GS inputs (16 bytes per input) */
	int					gs_out_ring_offset; /* running ring offset for GS outputs (16 bytes per output) */
	int					gs_next_vertex;
	struct r600_shader			*gs_for_vs; /* NOTE(review): presumably the GS paired with a VS-as-ES compile — confirm against r600_shader_from_tgsi */
	int					gs_export_gpr_treg;
};
303
/* One entry of the per-chip TGSI opcode dispatch tables
 * (r600_/eg_/cm_shader_tgsi_instruction). */
struct r600_shader_tgsi_instruction {
	unsigned	tgsi_opcode; /* TGSI_OPCODE_* this entry translates */
	unsigned	is_op3;      /* non-zero for three-operand ALU ops */
	unsigned	op;          /* hardware opcode used by the handler */
	int (*process)(struct r600_shader_ctx *ctx); /* emit callback */
};
310
311 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind);
312 static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
313 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
314 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
315 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
316 static int tgsi_else(struct r600_shader_ctx *ctx);
317 static int tgsi_endif(struct r600_shader_ctx *ctx);
318 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
319 static int tgsi_endloop(struct r600_shader_ctx *ctx);
320 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
321
322 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
323 {
324 struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
325 int j;
326
327 if (i->Instruction.NumDstRegs > 1) {
328 R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
329 return -EINVAL;
330 }
331 if (i->Instruction.Predicate) {
332 R600_ERR("predicate unsupported\n");
333 return -EINVAL;
334 }
335 #if 0
336 if (i->Instruction.Label) {
337 R600_ERR("label unsupported\n");
338 return -EINVAL;
339 }
340 #endif
341 for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
342 if (i->Src[j].Register.Dimension) {
343 switch (i->Src[j].Register.File) {
344 case TGSI_FILE_CONSTANT:
345 break;
346 case TGSI_FILE_INPUT:
347 if (ctx->type == TGSI_PROCESSOR_GEOMETRY)
348 break;
349 default:
350 R600_ERR("unsupported src %d (dimension %d)\n", j,
351 i->Src[j].Register.Dimension);
352 return -EINVAL;
353 }
354 }
355 }
356 for (j = 0; j < i->Instruction.NumDstRegs; j++) {
357 if (i->Dst[j].Register.Dimension) {
358 R600_ERR("unsupported dst (dimension)\n");
359 return -EINVAL;
360 }
361 }
362 return 0;
363 }
364
365 static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
366 int input)
367 {
368 int ij_index = 0;
369
370 if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_PERSPECTIVE) {
371 if (ctx->shader->input[input].centroid)
372 ij_index++;
373 } else if (ctx->shader->input[input].interpolate == TGSI_INTERPOLATE_LINEAR) {
374 /* if we have perspective add one */
375 if (ctx->input_perspective) {
376 ij_index++;
377 /* if we have perspective centroid */
378 if (ctx->input_centroid)
379 ij_index++;
380 }
381 if (ctx->shader->input[input].centroid)
382 ij_index++;
383 }
384
385 ctx->shader->input[input].ij_index = ij_index;
386 }
387
/* Emit the interpolation ALU ops for one perspective/linear FS input.
 * Eight slots are emitted in two groups of four: the first group uses
 * INTERP_ZW (producing the .z/.w result channels), the second INTERP_XY
 * (.x/.y); only the slots whose destination channel belongs to their
 * group (i = 2..5) actually write. */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index: two ij pairs per GPR,
	 * each pair occupying two channels */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		/* only slots 2..5 produce a channel of the final result */
		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* alternate between the two barycentric channels of the pair */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		/* parameter (attribute) data comes from the LDS slot */
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1; /* close each 4-slot ALU group */
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
428
429 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
430 {
431 int i, r;
432 struct r600_bytecode_alu alu;
433
434 for (i = 0; i < 4; i++) {
435 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
436
437 alu.op = ALU_OP1_INTERP_LOAD_P0;
438
439 alu.dst.sel = ctx->shader->input[input].gpr;
440 alu.dst.write = 1;
441
442 alu.dst.chan = i;
443
444 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
445 alu.src[0].chan = i;
446
447 if (i == 3)
448 alu.last = 1;
449 r = r600_bytecode_add_alu(ctx->bc, &alu);
450 if (r)
451 return r;
452 }
453 return 0;
454 }
455
456 /*
457 * Special export handling in shaders
458 *
459 * shader export ARRAY_BASE for EXPORT_POS:
460 * 60 is position
461 * 61 is misc vector
462 * 62, 63 are clip distance vectors
463 *
464 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
465 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
466 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
467 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
468 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
469 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
470 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
471 * exclusive from render target index)
472 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
473 *
474 *
475 * shader export ARRAY_BASE for EXPORT_PIXEL:
476 * 0-7 CB targets
477 * 61 computed Z vector
478 *
479 * The use of the values exported in the computed Z vector are controlled
480 * by DB_SHADER_CONTROL:
481 * Z_EXPORT_ENABLE - Z as a float in RED
482 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
483 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
484 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
485 * DB_SOURCE_FORMAT - export control restrictions
486 *
487 */
488
489
490 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
491 static int r600_spi_sid(struct r600_shader_io * io)
492 {
493 int index, name = io->name;
494
495 /* These params are handled differently, they don't need
496 * semantic indices, so we'll use 0 for them.
497 */
498 if (name == TGSI_SEMANTIC_POSITION ||
499 name == TGSI_SEMANTIC_PSIZE ||
500 name == TGSI_SEMANTIC_EDGEFLAG ||
501 name == TGSI_SEMANTIC_FACE)
502 index = 0;
503 else {
504 if (name == TGSI_SEMANTIC_GENERIC) {
505 /* For generic params simply use sid from tgsi */
506 index = io->sid;
507 } else {
508 /* For non-generic params - pack name and sid into 8 bits */
509 index = 0x80 | (name<<3) | (io->sid);
510 }
511
512 /* Make sure that all really used indices have nonzero value, so
513 * we can just compare it to 0 later instead of comparing the name
514 * with different values to detect special cases. */
515 index++;
516 }
517
518 return index;
519 };
520
521 /* turn input into interpolate on EG */
522 static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
523 {
524 int r = 0;
525
526 if (ctx->shader->input[index].spi_sid) {
527 ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
528 if (ctx->shader->input[index].interpolate > 0) {
529 evergreen_interp_assign_ij_index(ctx, index);
530 if (!ctx->use_llvm)
531 r = evergreen_interp_alu(ctx, index);
532 } else {
533 if (!ctx->use_llvm)
534 r = evergreen_interp_flat(ctx, index);
535 }
536 }
537 return r;
538 }
539
540 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
541 {
542 struct r600_bytecode_alu alu;
543 int i, r;
544 int gpr_front = ctx->shader->input[front].gpr;
545 int gpr_back = ctx->shader->input[back].gpr;
546
547 for (i = 0; i < 4; i++) {
548 memset(&alu, 0, sizeof(alu));
549 alu.op = ALU_OP3_CNDGT;
550 alu.is_op3 = 1;
551 alu.dst.write = 1;
552 alu.dst.sel = gpr_front;
553 alu.src[0].sel = ctx->face_gpr;
554 alu.src[1].sel = gpr_front;
555 alu.src[2].sel = gpr_back;
556
557 alu.dst.chan = i;
558 alu.src[1].chan = i;
559 alu.src[2].chan = i;
560 alu.last = (i==3);
561
562 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
563 return r;
564 }
565
566 return 0;
567 }
568
/* Translate one TGSI declaration into r600 shader state: record
 * inputs/outputs (with their SPI semantic ids), indirectly-addressed
 * temp arrays, and emit setup code for some system values.
 * Returns 0 on success, -EINVAL for unsupported declarations. */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		i = ctx->shader->ninput;
		assert(i < Elements(ctx->shader->input));
		ctx->shader->ninput += count;
		ctx->shader->input[i].name = d->Semantic.Name;
		ctx->shader->input[i].sid = d->Semantic.Index;
		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
		ctx->shader->input[i].centroid = d->Interp.Centroid;
		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
			/* remember special FS inputs for later passes */
			switch (ctx->shader->input[i].name) {
			case TGSI_SEMANTIC_FACE:
				ctx->face_gpr = ctx->shader->input[i].gpr;
				break;
			case TGSI_SEMANTIC_COLOR:
				ctx->colors_used++;
				break;
			case TGSI_SEMANTIC_POSITION:
				ctx->fragcoord_input = i;
				break;
			}
			if (ctx->bc->chip_class >= EVERGREEN) {
				if ((r = evergreen_interp_input(ctx, i)))
					return r;
			}
		} else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
			/* FIXME probably skip inputs if they aren't passed in the ring */
			ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
			ctx->next_ring_offset += 16;
			if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
				ctx->shader->gs_prim_id_input = true;
		}
		/* a declaration can span a register range: replicate the
		 * entry for each register, bumping only the GPR */
		for (j = 1; j < count; ++j) {
			ctx->shader->input[i + j] = ctx->shader->input[i];
			ctx->shader->input[i + j].gpr += j;
		}
		break;
	case TGSI_FILE_OUTPUT:
		i = ctx->shader->noutput++;
		assert(i < Elements(ctx->shader->output));
		ctx->shader->output[i].name = d->Semantic.Name;
		ctx->shader->output[i].sid = d->Semantic.Index;
		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
		if (ctx->type == TGSI_PROCESSOR_VERTEX ||
		    ctx->type == TGSI_PROCESSOR_GEOMETRY) {
			ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
			/* flag writes to the misc vector (export 61) and the
			 * clip-distance exports so PA_CL_VS_OUT_CNTL can be
			 * programmed accordingly (see the export comment above) */
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_CLIPDIST:
				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
				break;
			case TGSI_SEMANTIC_PSIZE:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_point_size = 1;
				break;
			case TGSI_SEMANTIC_EDGEFLAG:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_edgeflag = 1;
				ctx->edgeflag_output = i;
				break;
			case TGSI_SEMANTIC_VIEWPORT_INDEX:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_viewport = 1;
				break;
			case TGSI_SEMANTIC_LAYER:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_layer = 1;
				break;
			case TGSI_SEMANTIC_CLIPVERTEX:
				ctx->clip_vertex_write = TRUE;
				ctx->cv_output = i;
				break;
			}
			if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				ctx->gs_out_ring_offset += 16;
			}
		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_COLOR:
				ctx->shader->nr_ps_max_color_exports++;
				break;
			}
		}
		break;
	case TGSI_FILE_TEMPORARY:
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				/* indirectly-addressed temp array: record its
				 * GPR range for the register allocator */
				r600_add_gpr_array(ctx->shader,
						   ctx->file_offset[TGSI_FILE_TEMPORARY] +
						   d->Range.First,
						   d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_ADDRESS:
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			if (!ctx->native_integers) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				/* INSTANCEID is read from R0.w (see tgsi_src);
				 * convert it to float in place for shaders
				 * without native integer support */
				alu.op = ALU_OP1_INT_TO_FLT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		/* fallthrough: any other system value is unsupported */
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
704
705 static int r600_get_temp(struct r600_shader_ctx *ctx)
706 {
707 return ctx->temp_reg + ctx->max_driver_temp_used++;
708 }
709
710 /*
711 * for evergreen we need to scan the shader to find the number of GPRs we need to
712 * reserve for interpolation.
713 *
714 * we need to know if we are going to emit
715 * any centroid inputs
716 * if perspective and linear are required
717 */
718 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
719 {
720 int i;
721 int num_baryc;
722
723 ctx->input_linear = FALSE;
724 ctx->input_perspective = FALSE;
725 ctx->input_centroid = FALSE;
726 ctx->num_interp_gpr = 1;
727
728 /* any centroid inputs */
729 for (i = 0; i < ctx->info.num_inputs; i++) {
730 /* skip position/face */
731 if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
732 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE)
733 continue;
734 if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_LINEAR)
735 ctx->input_linear = TRUE;
736 if (ctx->info.input_interpolate[i] == TGSI_INTERPOLATE_PERSPECTIVE)
737 ctx->input_perspective = TRUE;
738 if (ctx->info.input_centroid[i])
739 ctx->input_centroid = TRUE;
740 }
741
742 num_baryc = 0;
743 /* ignoring sample for now */
744 if (ctx->input_perspective)
745 num_baryc++;
746 if (ctx->input_linear)
747 num_baryc++;
748 if (ctx->input_centroid)
749 num_baryc *= 2;
750
751 ctx->num_interp_gpr += (num_baryc + 1) >> 1;
752
753 /* XXX PULL MODEL and LINE STIPPLE, FIXED PT POS */
754 return ctx->num_interp_gpr;
755 }
756
/* Lower one TGSI source operand into r600 form:
 * - immediates become inline literals (or hardware special constants
 *   when all four swizzle components select the same value)
 * - INSTANCEID / VERTEXID system values are read from GPR 0
 *   (channels w and x respectively)
 * - other files get the per-file GPR base offset and optional
 *   AR-relative addressing; 2D constant refs select the kcache bank */
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
		    (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			/* replicated component: try to map the value onto a
			 * hardware inline constant instead of a literal */
			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		/* fall back to an inline literal: copy all 4 dwords */
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			/* instance id is delivered in R0.w */
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			/* vertex id is delivered in R0.x */
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		}
	} else {
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
		}
	}
}
809
/* Fetch a relatively-addressed constant from constant buffer cb_idx into
 * dst_reg.xyzw with a vertex fetch.  The fetch address is taken from
 * channel ar_chan of the AR register; when 'offset' is non-zero it is
 * first added (integer add) into dst_reg.ar_chan, which then serves as
 * the address register instead.
 * Returns 0 on success or a negative error from bytecode emission. */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
				unsigned int cb_idx, unsigned int offset, unsigned ar_chan,
				unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		/* dst_reg.ar_chan = AR.ar_chan + offset */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = 2;		/* VTX_FETCH_NO_INDEX_OFFSET */
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.srf_mode_all = 1;		/* SRF_MODE_NO_ZERO */
	vtx.endian = r600_endian_swap(32);

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
865
866 static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
867 {
868 struct r600_bytecode_vtx vtx;
869 int r;
870 unsigned index = src->Register.Index;
871 unsigned vtx_id = src->Dimension.Index;
872 int offset_reg = vtx_id / 3;
873 int offset_chan = vtx_id % 3;
874
875 /* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
876 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
877
878 if (offset_reg == 0 && offset_chan == 2)
879 offset_chan = 3;
880
881 if (src->Dimension.Indirect) {
882 int treg[3];
883 int t2;
884 struct r600_bytecode_alu alu;
885 int r, i;
886
887 /* you have got to be shitting me -
888 we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
889 at least this is what fglrx seems to do. */
890 for (i = 0; i < 3; i++) {
891 treg[i] = r600_get_temp(ctx);
892 }
893 t2 = r600_get_temp(ctx);
894 for (i = 0; i < 3; i++) {
895 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
896 alu.op = ALU_OP1_MOV;
897 alu.src[0].sel = 0;
898 alu.src[0].chan = i == 2 ? 3 : i;
899 alu.dst.sel = treg[i];
900 alu.dst.chan = 0;
901 alu.dst.write = 1;
902 alu.last = 1;
903 r = r600_bytecode_add_alu(ctx->bc, &alu);
904 if (r)
905 return r;
906 }
907 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
908 alu.op = ALU_OP1_MOV;
909 alu.src[0].sel = treg[0];
910 alu.src[0].rel = 1;
911 alu.dst.sel = t2;
912 alu.dst.write = 1;
913 alu.last = 1;
914 r = r600_bytecode_add_alu(ctx->bc, &alu);
915 if (r)
916 return r;
917 offset_reg = t2;
918 }
919
920
921 memset(&vtx, 0, sizeof(vtx));
922 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
923 vtx.fetch_type = 2; /* VTX_FETCH_NO_INDEX_OFFSET */
924 vtx.src_gpr = offset_reg;
925 vtx.src_sel_x = offset_chan;
926 vtx.offset = index * 16; /*bytes*/
927 vtx.mega_fetch_count = 16;
928 vtx.dst_gpr = dst_reg;
929 vtx.dst_sel_x = 0; /* SEL_X */
930 vtx.dst_sel_y = 1; /* SEL_Y */
931 vtx.dst_sel_z = 2; /* SEL_Z */
932 vtx.dst_sel_w = 3; /* SEL_W */
933 if (ctx->bc->chip_class >= EVERGREEN) {
934 vtx.use_const_fields = 1;
935 } else {
936 vtx.data_format = FMT_32_32_32_32_FLOAT;
937 }
938
939 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
940 return r;
941
942 return 0;
943 }
944
945 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
946 {
947 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
948 int i;
949
950 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
951 struct tgsi_full_src_register *src = &inst->Src[i];
952
953 if (src->Register.File == TGSI_FILE_INPUT) {
954 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
955 /* primitive id is in R0.z */
956 ctx->src[i].sel = 0;
957 ctx->src[i].swizzle[0] = 2;
958 }
959 }
960 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
961 int treg = r600_get_temp(ctx);
962
963 fetch_gs_input(ctx, src, treg);
964 ctx->src[i].sel = treg;
965 }
966 }
967 return 0;
968 }
969
/* Lower the constant-file sources of the current instruction.
 *
 * All sources are first translated with tgsi_src().  Then every constant
 * source except the last is copied into a temp GPR (NOTE(review):
 * presumably because the ALU cannot read several distinct constant
 * operands in one instruction — confirm against the ISA docs), and
 * relatively-addressed constants are always fetched into a temp via
 * tgsi_fetch_rel_const(). */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* translate all sources, counting the constant-file ones */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	/* j counts the constant sources still to come; while j > 0 more
	 * constants follow, so this one is moved out of the constant file */
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			/* relative addressing: fetch into a temp (sel - 512 is
			 * the index within the constant file) */
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* copy the constant to a temp with four MOVs */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		}
	}
	return 0;
}
1021
1022 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1023 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1024 {
1025 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1026 struct r600_bytecode_alu alu;
1027 int i, j, k, nliteral, r;
1028
1029 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1030 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1031 nliteral++;
1032 }
1033 }
1034 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1035 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1036 int treg = r600_get_temp(ctx);
1037 for (k = 0; k < 4; k++) {
1038 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1039 alu.op = ALU_OP1_MOV;
1040 alu.src[0].sel = ctx->src[i].sel;
1041 alu.src[0].chan = k;
1042 alu.src[0].value = ctx->src[i].value[k];
1043 alu.dst.sel = treg;
1044 alu.dst.chan = k;
1045 alu.dst.write = 1;
1046 if (k == 3)
1047 alu.last = 1;
1048 r = r600_bytecode_add_alu(ctx->bc, &alu);
1049 if (r)
1050 return r;
1051 }
1052 ctx->src[i].sel = treg;
1053 j--;
1054 }
1055 }
1056 return 0;
1057 }
1058
1059 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1060 {
1061 int i, r, count = ctx->shader->ninput;
1062
1063 for (i = 0; i < count; i++) {
1064 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1065 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
1066 if (r)
1067 return r;
1068 }
1069 }
1070 return 0;
1071 }
1072
1073 static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so)
1074 {
1075 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
1076 int i, j, r;
1077
1078 /* Sanity checking. */
1079 if (so->num_outputs > PIPE_MAX_SHADER_OUTPUTS) {
1080 R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
1081 r = -EINVAL;
1082 goto out_err;
1083 }
1084 for (i = 0; i < so->num_outputs; i++) {
1085 if (so->output[i].output_buffer >= 4) {
1086 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
1087 so->output[i].output_buffer);
1088 r = -EINVAL;
1089 goto out_err;
1090 }
1091 }
1092
1093 /* Initialize locations where the outputs are stored. */
1094 for (i = 0; i < so->num_outputs; i++) {
1095 so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
1096
1097 /* Lower outputs with dst_offset < start_component.
1098 *
1099 * We can only output 4D vectors with a write mask, e.g. we can
1100 * only output the W component at offset 3, etc. If we want
1101 * to store Y, Z, or W at buffer offset 0, we need to use MOV
1102 * to move it to X and output X. */
1103 if (so->output[i].dst_offset < so->output[i].start_component) {
1104 unsigned tmp = r600_get_temp(ctx);
1105
1106 for (j = 0; j < so->output[i].num_components; j++) {
1107 struct r600_bytecode_alu alu;
1108 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1109 alu.op = ALU_OP1_MOV;
1110 alu.src[0].sel = so_gpr[i];
1111 alu.src[0].chan = so->output[i].start_component + j;
1112
1113 alu.dst.sel = tmp;
1114 alu.dst.chan = j;
1115 alu.dst.write = 1;
1116 if (j == so->output[i].num_components - 1)
1117 alu.last = 1;
1118 r = r600_bytecode_add_alu(ctx->bc, &alu);
1119 if (r)
1120 return r;
1121 }
1122 so->output[i].start_component = 0;
1123 so_gpr[i] = tmp;
1124 }
1125 }
1126
1127 /* Write outputs to buffers. */
1128 for (i = 0; i < so->num_outputs; i++) {
1129 struct r600_bytecode_output output;
1130
1131 memset(&output, 0, sizeof(struct r600_bytecode_output));
1132 output.gpr = so_gpr[i];
1133 output.elem_size = so->output[i].num_components;
1134 output.array_base = so->output[i].dst_offset - so->output[i].start_component;
1135 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1136 output.burst_count = 1;
1137 /* array_size is an upper limit for the burst_count
1138 * with MEM_STREAM instructions */
1139 output.array_size = 0xFFF;
1140 output.comp_mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
1141 if (ctx->bc->chip_class >= EVERGREEN) {
1142 switch (so->output[i].output_buffer) {
1143 case 0:
1144 output.op = CF_OP_MEM_STREAM0_BUF0;
1145 break;
1146 case 1:
1147 output.op = CF_OP_MEM_STREAM0_BUF1;
1148 break;
1149 case 2:
1150 output.op = CF_OP_MEM_STREAM0_BUF2;
1151 break;
1152 case 3:
1153 output.op = CF_OP_MEM_STREAM0_BUF3;
1154 break;
1155 }
1156 } else {
1157 switch (so->output[i].output_buffer) {
1158 case 0:
1159 output.op = CF_OP_MEM_STREAM0;
1160 break;
1161 case 1:
1162 output.op = CF_OP_MEM_STREAM1;
1163 break;
1164 case 2:
1165 output.op = CF_OP_MEM_STREAM2;
1166 break;
1167 case 3:
1168 output.op = CF_OP_MEM_STREAM3;
1169 break;
1170 }
1171 }
1172 r = r600_bytecode_add_output(ctx->bc, &output);
1173 if (r)
1174 goto out_err;
1175 }
1176 return 0;
1177 out_err:
1178 return r;
1179 }
1180
1181 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
1182 {
1183 struct r600_bytecode_alu alu;
1184 unsigned reg;
1185
1186 if (!ctx->shader->vs_out_edgeflag)
1187 return;
1188
1189 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
1190
1191 /* clamp(x, 0, 1) */
1192 memset(&alu, 0, sizeof(alu));
1193 alu.op = ALU_OP1_MOV;
1194 alu.src[0].sel = reg;
1195 alu.dst.sel = reg;
1196 alu.dst.write = 1;
1197 alu.dst.clamp = 1;
1198 alu.last = 1;
1199 r600_bytecode_add_alu(ctx->bc, &alu);
1200
1201 memset(&alu, 0, sizeof(alu));
1202 alu.op = ALU_OP1_FLT_TO_INT;
1203 alu.src[0].sel = reg;
1204 alu.dst.sel = reg;
1205 alu.dst.write = 1;
1206 alu.last = 1;
1207 r600_bytecode_add_alu(ctx->bc, &alu);
1208 }
1209
1210 static int generate_gs_copy_shader(struct r600_context *rctx,
1211 struct r600_pipe_shader *gs,
1212 struct pipe_stream_output_info *so)
1213 {
1214 struct r600_shader_ctx ctx = {};
1215 struct r600_shader *gs_shader = &gs->shader;
1216 struct r600_pipe_shader *cshader;
1217 int ocnt = gs_shader->noutput;
1218 struct r600_bytecode_alu alu;
1219 struct r600_bytecode_vtx vtx;
1220 struct r600_bytecode_output output;
1221 struct r600_bytecode_cf *cf_jump, *cf_pop,
1222 *last_exp_pos = NULL, *last_exp_param = NULL;
1223 int i, next_clip_pos = 61, next_param = 0;
1224
1225 cshader = calloc(1, sizeof(struct r600_pipe_shader));
1226 if (!cshader)
1227 return 0;
1228
1229 memcpy(cshader->shader.output, gs_shader->output, ocnt *
1230 sizeof(struct r600_shader_io));
1231
1232 cshader->shader.noutput = ocnt;
1233
1234 ctx.shader = &cshader->shader;
1235 ctx.bc = &ctx.shader->bc;
1236 ctx.type = ctx.bc->type = TGSI_PROCESSOR_VERTEX;
1237
1238 r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
1239 rctx->screen->has_compressed_msaa_texturing);
1240
1241 ctx.bc->isa = rctx->isa;
1242
1243 /* R0.x = R0.x & 0x3fffffff */
1244 memset(&alu, 0, sizeof(alu));
1245 alu.op = ALU_OP2_AND_INT;
1246 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1247 alu.src[1].value = 0x3fffffff;
1248 alu.dst.write = 1;
1249 r600_bytecode_add_alu(ctx.bc, &alu);
1250
1251 /* R0.y = R0.x >> 30 */
1252 memset(&alu, 0, sizeof(alu));
1253 alu.op = ALU_OP2_LSHR_INT;
1254 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1255 alu.src[1].value = 0x1e;
1256 alu.dst.chan = 1;
1257 alu.dst.write = 1;
1258 alu.last = 1;
1259 r600_bytecode_add_alu(ctx.bc, &alu);
1260
1261 /* PRED_SETE_INT __, R0.y, 0 */
1262 memset(&alu, 0, sizeof(alu));
1263 alu.op = ALU_OP2_PRED_SETE_INT;
1264 alu.src[0].chan = 1;
1265 alu.src[1].sel = V_SQ_ALU_SRC_0;
1266 alu.execute_mask = 1;
1267 alu.update_pred = 1;
1268 alu.last = 1;
1269 r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
1270
1271 r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
1272 cf_jump = ctx.bc->cf_last;
1273
1274 /* fetch vertex data from GSVS ring */
1275 for (i = 0; i < ocnt; ++i) {
1276 struct r600_shader_io *out = &ctx.shader->output[i];
1277 out->gpr = i + 1;
1278 out->ring_offset = i * 16;
1279
1280 memset(&vtx, 0, sizeof(vtx));
1281 vtx.op = FETCH_OP_VFETCH;
1282 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1283 vtx.fetch_type = 2;
1284 vtx.offset = out->ring_offset;
1285 vtx.dst_gpr = out->gpr;
1286 vtx.dst_sel_x = 0;
1287 vtx.dst_sel_y = 1;
1288 vtx.dst_sel_z = 2;
1289 vtx.dst_sel_w = 3;
1290 if (rctx->b.chip_class >= EVERGREEN) {
1291 vtx.use_const_fields = 1;
1292 } else {
1293 vtx.data_format = FMT_32_32_32_32_FLOAT;
1294 }
1295
1296 r600_bytecode_add_vtx(ctx.bc, &vtx);
1297 }
1298
1299 /* XXX handle clipvertex, streamout? */
1300 emit_streamout(&ctx, so);
1301
1302 /* export vertex data */
1303 /* XXX factor out common code with r600_shader_from_tgsi ? */
1304 for (i = 0; i < ocnt; ++i) {
1305 struct r600_shader_io *out = &ctx.shader->output[i];
1306
1307 if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
1308 continue;
1309
1310 memset(&output, 0, sizeof(output));
1311 output.gpr = out->gpr;
1312 output.elem_size = 3;
1313 output.swizzle_x = 0;
1314 output.swizzle_y = 1;
1315 output.swizzle_z = 2;
1316 output.swizzle_w = 3;
1317 output.burst_count = 1;
1318 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1319 output.op = CF_OP_EXPORT;
1320 switch (out->name) {
1321 case TGSI_SEMANTIC_POSITION:
1322 output.array_base = 60;
1323 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1324 break;
1325
1326 case TGSI_SEMANTIC_PSIZE:
1327 output.array_base = 61;
1328 if (next_clip_pos == 61)
1329 next_clip_pos = 62;
1330 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1331 output.swizzle_y = 7;
1332 output.swizzle_z = 7;
1333 output.swizzle_w = 7;
1334 ctx.shader->vs_out_misc_write = 1;
1335 ctx.shader->vs_out_point_size = 1;
1336 break;
1337 case TGSI_SEMANTIC_LAYER:
1338 if (out->spi_sid) {
1339 /* duplicate it as PARAM to pass to the pixel shader */
1340 output.array_base = next_param++;
1341 r600_bytecode_add_output(ctx.bc, &output);
1342 last_exp_param = ctx.bc->cf_last;
1343 }
1344 output.array_base = 61;
1345 if (next_clip_pos == 61)
1346 next_clip_pos = 62;
1347 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1348 output.swizzle_x = 7;
1349 output.swizzle_y = 7;
1350 output.swizzle_z = 0;
1351 output.swizzle_w = 7;
1352 ctx.shader->vs_out_misc_write = 1;
1353 ctx.shader->vs_out_layer = 1;
1354 break;
1355 case TGSI_SEMANTIC_VIEWPORT_INDEX:
1356 if (out->spi_sid) {
1357 /* duplicate it as PARAM to pass to the pixel shader */
1358 output.array_base = next_param++;
1359 r600_bytecode_add_output(ctx.bc, &output);
1360 last_exp_param = ctx.bc->cf_last;
1361 }
1362 output.array_base = 61;
1363 if (next_clip_pos == 61)
1364 next_clip_pos = 62;
1365 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1366 ctx.shader->vs_out_misc_write = 1;
1367 ctx.shader->vs_out_viewport = 1;
1368 output.swizzle_x = 7;
1369 output.swizzle_y = 7;
1370 output.swizzle_z = 7;
1371 output.swizzle_w = 0;
1372 break;
1373 case TGSI_SEMANTIC_CLIPDIST:
1374 /* spi_sid is 0 for clipdistance outputs that were generated
1375 * for clipvertex - we don't need to pass them to PS */
1376 ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
1377 if (out->spi_sid) {
1378 /* duplicate it as PARAM to pass to the pixel shader */
1379 output.array_base = next_param++;
1380 r600_bytecode_add_output(ctx.bc, &output);
1381 last_exp_param = ctx.bc->cf_last;
1382 }
1383 output.array_base = next_clip_pos++;
1384 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1385 break;
1386 case TGSI_SEMANTIC_FOG:
1387 output.swizzle_y = 4; /* 0 */
1388 output.swizzle_z = 4; /* 0 */
1389 output.swizzle_w = 5; /* 1 */
1390 break;
1391 default:
1392 output.array_base = next_param++;
1393 break;
1394 }
1395 r600_bytecode_add_output(ctx.bc, &output);
1396 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
1397 last_exp_param = ctx.bc->cf_last;
1398 else
1399 last_exp_pos = ctx.bc->cf_last;
1400 }
1401
1402 if (!last_exp_pos) {
1403 memset(&output, 0, sizeof(output));
1404 output.gpr = 0;
1405 output.elem_size = 3;
1406 output.swizzle_x = 7;
1407 output.swizzle_y = 7;
1408 output.swizzle_z = 7;
1409 output.swizzle_w = 7;
1410 output.burst_count = 1;
1411 output.type = 2;
1412 output.op = CF_OP_EXPORT;
1413 output.array_base = 60;
1414 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1415 r600_bytecode_add_output(ctx.bc, &output);
1416 last_exp_pos = ctx.bc->cf_last;
1417 }
1418
1419 if (!last_exp_param) {
1420 memset(&output, 0, sizeof(output));
1421 output.gpr = 0;
1422 output.elem_size = 3;
1423 output.swizzle_x = 7;
1424 output.swizzle_y = 7;
1425 output.swizzle_z = 7;
1426 output.swizzle_w = 7;
1427 output.burst_count = 1;
1428 output.type = 2;
1429 output.op = CF_OP_EXPORT;
1430 output.array_base = next_param++;
1431 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1432 r600_bytecode_add_output(ctx.bc, &output);
1433 last_exp_param = ctx.bc->cf_last;
1434 }
1435
1436 last_exp_pos->op = CF_OP_EXPORT_DONE;
1437 last_exp_param->op = CF_OP_EXPORT_DONE;
1438
1439 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
1440 cf_pop = ctx.bc->cf_last;
1441
1442 cf_jump->cf_addr = cf_pop->id + 2;
1443 cf_jump->pop_count = 1;
1444 cf_pop->cf_addr = cf_pop->id + 2;
1445 cf_pop->pop_count = 1;
1446
1447 if (ctx.bc->chip_class == CAYMAN)
1448 cm_bytecode_add_cf_end(ctx.bc);
1449 else {
1450 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
1451 ctx.bc->cf_last->end_of_program = 1;
1452 }
1453
1454 gs->gs_copy_shader = cshader;
1455
1456 ctx.bc->nstack = 1;
1457 cshader->shader.ring_item_size = ocnt * 16;
1458
1459 return r600_bytecode_build(ctx.bc);
1460 }
1461
1462 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind)
1463 {
1464 struct r600_bytecode_output output;
1465 int i, k, ring_offset;
1466
1467 for (i = 0; i < ctx->shader->noutput; i++) {
1468 if (ctx->gs_for_vs) {
1469 /* for ES we need to lookup corresponding ring offset expected by GS
1470 * (map this output to GS input by name and sid) */
1471 /* FIXME precompute offsets */
1472 ring_offset = -1;
1473 for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
1474 struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
1475 struct r600_shader_io *out = &ctx->shader->output[i];
1476 if (in->name == out->name && in->sid == out->sid)
1477 ring_offset = in->ring_offset;
1478 }
1479
1480 if (ring_offset == -1)
1481 continue;
1482 } else
1483 ring_offset = i * 16;
1484
1485 /* next_ring_offset after parsing input decls contains total size of
1486 * single vertex data, gs_next_vertex - current vertex index */
1487 if (!ind)
1488 ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
1489
1490 /* get a temp and add the ring offset to the next vertex base in the shader */
1491 memset(&output, 0, sizeof(struct r600_bytecode_output));
1492 output.gpr = ctx->shader->output[i].gpr;
1493 output.elem_size = 3;
1494 output.comp_mask = 0xF;
1495 output.burst_count = 1;
1496
1497 if (ind)
1498 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
1499 else
1500 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1501 output.op = CF_OP_MEM_RING;
1502
1503
1504 if (ind) {
1505 output.array_base = ring_offset >> 2; /* in dwords */
1506 output.array_size = 0xfff;
1507 output.index_gpr = ctx->gs_export_gpr_treg;
1508 } else
1509 output.array_base = ring_offset >> 2; /* in dwords */
1510 r600_bytecode_add_output(ctx->bc, &output);
1511 }
1512
1513 if (ind) {
1514 struct r600_bytecode_alu alu;
1515 int r;
1516
1517 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1518 alu.op = ALU_OP2_ADD_INT;
1519 alu.src[0].sel = ctx->gs_export_gpr_treg;
1520 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1521 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
1522 alu.dst.sel = ctx->gs_export_gpr_treg;
1523 alu.dst.write = 1;
1524 alu.last = 1;
1525 r = r600_bytecode_add_alu(ctx->bc, &alu);
1526 if (r)
1527 return r;
1528 }
1529 ++ctx->gs_next_vertex;
1530 return 0;
1531 }
1532
1533 static int r600_shader_from_tgsi(struct r600_context *rctx,
1534 struct r600_pipe_shader *pipeshader,
1535 struct r600_shader_key key)
1536 {
1537 struct r600_screen *rscreen = rctx->screen;
1538 struct r600_shader *shader = &pipeshader->shader;
1539 struct tgsi_token *tokens = pipeshader->selector->tokens;
1540 struct pipe_stream_output_info so = pipeshader->selector->so;
1541 struct tgsi_full_immediate *immediate;
1542 struct tgsi_full_property *property;
1543 struct r600_shader_ctx ctx;
1544 struct r600_bytecode_output output[32];
1545 unsigned output_done, noutput;
1546 unsigned opcode;
1547 int i, j, k, r = 0;
1548 int next_param_base = 0, next_clip_base;
1549 int max_color_exports = MAX2(key.nr_cbufs, 1);
1550 /* Declarations used by llvm code */
1551 bool use_llvm = false;
1552 bool indirect_gprs;
1553 bool ring_outputs = false;
1554 bool pos_emitted = false;
1555
1556 #ifdef R600_USE_LLVM
1557 use_llvm = rscreen->b.debug_flags & DBG_LLVM;
1558 #endif
1559 ctx.bc = &shader->bc;
1560 ctx.shader = shader;
1561 ctx.native_integers = true;
1562
1563 shader->vs_as_es = key.vs_as_es;
1564
1565 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
1566 rscreen->has_compressed_msaa_texturing);
1567 ctx.tokens = tokens;
1568 tgsi_scan_shader(tokens, &ctx.info);
1569 shader->indirect_files = ctx.info.indirect_files;
1570 indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT);
1571 tgsi_parse_init(&ctx.parse, tokens);
1572 ctx.type = ctx.parse.FullHeader.Processor.Processor;
1573 shader->processor_type = ctx.type;
1574 ctx.bc->type = shader->processor_type;
1575
1576 ring_outputs = key.vs_as_es || (ctx.type == TGSI_PROCESSOR_GEOMETRY);
1577
1578 if (key.vs_as_es) {
1579 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
1580 } else {
1581 ctx.gs_for_vs = NULL;
1582 }
1583
1584 ctx.next_ring_offset = 0;
1585 ctx.gs_out_ring_offset = 0;
1586 ctx.gs_next_vertex = 0;
1587
1588 ctx.face_gpr = -1;
1589 ctx.fragcoord_input = -1;
1590 ctx.colors_used = 0;
1591 ctx.clip_vertex_write = 0;
1592
1593 shader->nr_ps_color_exports = 0;
1594 shader->nr_ps_max_color_exports = 0;
1595
1596 shader->two_side = key.color_two_side;
1597
1598 /* register allocations */
1599 /* Values [0,127] correspond to GPR[0..127].
1600 * Values [128,159] correspond to constant buffer bank 0
1601 * Values [160,191] correspond to constant buffer bank 1
1602 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1603 * Values [256,287] correspond to constant buffer bank 2 (EG)
1604 * Values [288,319] correspond to constant buffer bank 3 (EG)
1605 * Other special values are shown in the list below.
1606 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1607 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1608 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1609 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1610 * 248 SQ_ALU_SRC_0: special constant 0.0.
1611 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
1612 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
1613 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1614 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
1615 * 253 SQ_ALU_SRC_LITERAL: literal constant.
1616 * 254 SQ_ALU_SRC_PV: previous vector result.
1617 * 255 SQ_ALU_SRC_PS: previous scalar result.
1618 */
1619 for (i = 0; i < TGSI_FILE_COUNT; i++) {
1620 ctx.file_offset[i] = 0;
1621 }
1622
1623 #ifdef R600_USE_LLVM
1624 if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
1625 fprintf(stderr, "Warning: R600 LLVM backend does not support "
1626 "indirect adressing. Falling back to TGSI "
1627 "backend.\n");
1628 use_llvm = 0;
1629 }
1630 #endif
1631 if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1632 ctx.file_offset[TGSI_FILE_INPUT] = 1;
1633 if (!use_llvm) {
1634 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
1635 }
1636 }
1637 if (ctx.type == TGSI_PROCESSOR_FRAGMENT && ctx.bc->chip_class >= EVERGREEN) {
1638 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
1639 }
1640 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
1641 /* FIXME 1 would be enough in some cases (3 or less input vertices) */
1642 ctx.file_offset[TGSI_FILE_INPUT] = 2;
1643 }
1644 ctx.use_llvm = use_llvm;
1645
1646 if (use_llvm) {
1647 ctx.file_offset[TGSI_FILE_OUTPUT] =
1648 ctx.file_offset[TGSI_FILE_INPUT];
1649 } else {
1650 ctx.file_offset[TGSI_FILE_OUTPUT] =
1651 ctx.file_offset[TGSI_FILE_INPUT] +
1652 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1653 }
1654 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
1655 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
1656
1657 /* Outside the GPR range. This will be translated to one of the
1658 * kcache banks later. */
1659 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
1660
1661 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
1662 ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
1663 ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
1664 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
1665 ctx.gs_export_gpr_treg = ctx.bc->ar_reg + 1;
1666 ctx.temp_reg = ctx.bc->ar_reg + 2;
1667 } else
1668 ctx.temp_reg = ctx.bc->ar_reg + 1;
1669
1670 if (indirect_gprs) {
1671 shader->max_arrays = 0;
1672 shader->num_arrays = 0;
1673
1674 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
1675 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
1676 ctx.file_offset[TGSI_FILE_OUTPUT] -
1677 ctx.file_offset[TGSI_FILE_INPUT],
1678 0x0F);
1679 }
1680 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
1681 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
1682 ctx.file_offset[TGSI_FILE_TEMPORARY] -
1683 ctx.file_offset[TGSI_FILE_OUTPUT],
1684 0x0F);
1685 }
1686 }
1687
1688 ctx.nliterals = 0;
1689 ctx.literals = NULL;
1690 shader->fs_write_all = FALSE;
1691 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1692 tgsi_parse_token(&ctx.parse);
1693 switch (ctx.parse.FullToken.Token.Type) {
1694 case TGSI_TOKEN_TYPE_IMMEDIATE:
1695 immediate = &ctx.parse.FullToken.FullImmediate;
1696 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
1697 if(ctx.literals == NULL) {
1698 r = -ENOMEM;
1699 goto out_err;
1700 }
1701 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
1702 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
1703 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
1704 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
1705 ctx.nliterals++;
1706 break;
1707 case TGSI_TOKEN_TYPE_DECLARATION:
1708 r = tgsi_declaration(&ctx);
1709 if (r)
1710 goto out_err;
1711 break;
1712 case TGSI_TOKEN_TYPE_INSTRUCTION:
1713 break;
1714 case TGSI_TOKEN_TYPE_PROPERTY:
1715 property = &ctx.parse.FullToken.FullProperty;
1716 switch (property->Property.PropertyName) {
1717 case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
1718 if (property->u[0].Data == 1)
1719 shader->fs_write_all = TRUE;
1720 break;
1721 case TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION:
1722 if (property->u[0].Data == 1)
1723 shader->vs_position_window_space = TRUE;
1724 break;
1725 case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
1726 /* we don't need this one */
1727 break;
1728 case TGSI_PROPERTY_GS_INPUT_PRIM:
1729 shader->gs_input_prim = property->u[0].Data;
1730 break;
1731 case TGSI_PROPERTY_GS_OUTPUT_PRIM:
1732 shader->gs_output_prim = property->u[0].Data;
1733 break;
1734 case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES:
1735 shader->gs_max_out_vertices = property->u[0].Data;
1736 break;
1737 }
1738 break;
1739 default:
1740 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
1741 r = -EINVAL;
1742 goto out_err;
1743 }
1744 }
1745
1746 shader->ring_item_size = ctx.next_ring_offset;
1747
1748 /* Process two side if needed */
1749 if (shader->two_side && ctx.colors_used) {
1750 int i, count = ctx.shader->ninput;
1751 unsigned next_lds_loc = ctx.shader->nlds;
1752
1753 /* additional inputs will be allocated right after the existing inputs,
1754 * we won't need them after the color selection, so we don't need to
1755 * reserve these gprs for the rest of the shader code and to adjust
1756 * output offsets etc. */
1757 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
1758 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1759
1760 if (ctx.face_gpr == -1) {
1761 i = ctx.shader->ninput++;
1762 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
1763 ctx.shader->input[i].spi_sid = 0;
1764 ctx.shader->input[i].gpr = gpr++;
1765 ctx.face_gpr = ctx.shader->input[i].gpr;
1766 }
1767
1768 for (i = 0; i < count; i++) {
1769 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1770 int ni = ctx.shader->ninput++;
1771 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
1772 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
1773 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
1774 ctx.shader->input[ni].gpr = gpr++;
1775 // TGSI to LLVM needs to know the lds position of inputs.
1776 // Non LLVM path computes it later (in process_twoside_color)
1777 ctx.shader->input[ni].lds_pos = next_lds_loc++;
1778 ctx.shader->input[i].back_color_input = ni;
1779 if (ctx.bc->chip_class >= EVERGREEN) {
1780 if ((r = evergreen_interp_input(&ctx, ni)))
1781 return r;
1782 }
1783 }
1784 }
1785 }
1786
1787 /* LLVM backend setup */
1788 #ifdef R600_USE_LLVM
1789 if (use_llvm) {
1790 struct radeon_llvm_context radeon_llvm_ctx;
1791 LLVMModuleRef mod;
1792 bool dump = r600_can_dump_shader(&rscreen->b, tokens);
1793 boolean use_kill = false;
1794
1795 memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
1796 radeon_llvm_ctx.type = ctx.type;
1797 radeon_llvm_ctx.two_side = shader->two_side;
1798 radeon_llvm_ctx.face_gpr = ctx.face_gpr;
1799 radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1;
1800 radeon_llvm_ctx.r600_inputs = ctx.shader->input;
1801 radeon_llvm_ctx.r600_outputs = ctx.shader->output;
1802 radeon_llvm_ctx.color_buffer_count = max_color_exports;
1803 radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
1804 radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
1805 radeon_llvm_ctx.stream_outputs = &so;
1806 radeon_llvm_ctx.clip_vertex = ctx.cv_output;
1807 radeon_llvm_ctx.alpha_to_one = key.alpha_to_one;
1808 radeon_llvm_ctx.has_compressed_msaa_texturing =
1809 ctx.bc->has_compressed_msaa_texturing;
1810 mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
1811 ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
1812 ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
1813
1814 if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
1815 radeon_llvm_dispose(&radeon_llvm_ctx);
1816 use_llvm = 0;
1817 fprintf(stderr, "R600 LLVM backend failed to compile "
1818 "shader. Falling back to TGSI\n");
1819 } else {
1820 ctx.file_offset[TGSI_FILE_OUTPUT] =
1821 ctx.file_offset[TGSI_FILE_INPUT];
1822 }
1823 if (use_kill)
1824 ctx.shader->uses_kill = use_kill;
1825 radeon_llvm_dispose(&radeon_llvm_ctx);
1826 }
1827 #endif
1828 /* End of LLVM backend setup */
1829
1830 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
1831 shader->nr_ps_max_color_exports = 8;
1832
1833 if (!use_llvm) {
1834 if (ctx.fragcoord_input >= 0) {
1835 if (ctx.bc->chip_class == CAYMAN) {
1836 for (j = 0 ; j < 4; j++) {
1837 struct r600_bytecode_alu alu;
1838 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1839 alu.op = ALU_OP1_RECIP_IEEE;
1840 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1841 alu.src[0].chan = 3;
1842
1843 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1844 alu.dst.chan = j;
1845 alu.dst.write = (j == 3);
1846 alu.last = 1;
1847 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1848 return r;
1849 }
1850 } else {
1851 struct r600_bytecode_alu alu;
1852 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1853 alu.op = ALU_OP1_RECIP_IEEE;
1854 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
1855 alu.src[0].chan = 3;
1856
1857 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
1858 alu.dst.chan = 3;
1859 alu.dst.write = 1;
1860 alu.last = 1;
1861 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
1862 return r;
1863 }
1864 }
1865
1866 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
1867 struct r600_bytecode_alu alu;
1868 int r;
1869
1870 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1871 alu.op = ALU_OP1_MOV;
1872 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
1873 alu.src[0].value = 0;
1874 alu.dst.sel = ctx.gs_export_gpr_treg;
1875 alu.dst.write = 1;
1876 alu.last = 1;
1877 r = r600_bytecode_add_alu(ctx.bc, &alu);
1878 if (r)
1879 return r;
1880 }
1881 if (shader->two_side && ctx.colors_used) {
1882 if ((r = process_twoside_color_inputs(&ctx)))
1883 return r;
1884 }
1885
1886 tgsi_parse_init(&ctx.parse, tokens);
1887 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1888 tgsi_parse_token(&ctx.parse);
1889 switch (ctx.parse.FullToken.Token.Type) {
1890 case TGSI_TOKEN_TYPE_INSTRUCTION:
1891 r = tgsi_is_supported(&ctx);
1892 if (r)
1893 goto out_err;
1894 ctx.max_driver_temp_used = 0;
1895 /* reserve first tmp for everyone */
1896 r600_get_temp(&ctx);
1897
1898 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
1899 if ((r = tgsi_split_constant(&ctx)))
1900 goto out_err;
1901 if ((r = tgsi_split_literal_constant(&ctx)))
1902 goto out_err;
1903 if (ctx.type == TGSI_PROCESSOR_GEOMETRY)
1904 if ((r = tgsi_split_gs_inputs(&ctx)))
1905 goto out_err;
1906 if (ctx.bc->chip_class == CAYMAN)
1907 ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
1908 else if (ctx.bc->chip_class >= EVERGREEN)
1909 ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
1910 else
1911 ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
1912 r = ctx.inst_info->process(&ctx);
1913 if (r)
1914 goto out_err;
1915 break;
1916 default:
1917 break;
1918 }
1919 }
1920 }
1921
1922 /* Reset the temporary register counter. */
1923 ctx.max_driver_temp_used = 0;
1924
1925 noutput = shader->noutput;
1926
1927 if (!ring_outputs && ctx.clip_vertex_write) {
1928 unsigned clipdist_temp[2];
1929
1930 clipdist_temp[0] = r600_get_temp(&ctx);
1931 clipdist_temp[1] = r600_get_temp(&ctx);
1932
1933 /* need to convert a clipvertex write into clipdistance writes and not export
1934 the clip vertex anymore */
1935
1936 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
1937 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1938 shader->output[noutput].gpr = clipdist_temp[0];
1939 noutput++;
1940 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
1941 shader->output[noutput].gpr = clipdist_temp[1];
1942 noutput++;
1943
1944 /* reset spi_sid for clipvertex output to avoid confusing spi */
1945 shader->output[ctx.cv_output].spi_sid = 0;
1946
1947 shader->clip_dist_write = 0xFF;
1948
1949 for (i = 0; i < 8; i++) {
1950 int oreg = i >> 2;
1951 int ochan = i & 3;
1952
1953 for (j = 0; j < 4; j++) {
1954 struct r600_bytecode_alu alu;
1955 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1956 alu.op = ALU_OP2_DOT4;
1957 alu.src[0].sel = shader->output[ctx.cv_output].gpr;
1958 alu.src[0].chan = j;
1959
1960 alu.src[1].sel = 512 + i;
1961 alu.src[1].kc_bank = R600_UCP_CONST_BUFFER;
1962 alu.src[1].chan = j;
1963
1964 alu.dst.sel = clipdist_temp[oreg];
1965 alu.dst.chan = j;
1966 alu.dst.write = (j == ochan);
1967 if (j == 3)
1968 alu.last = 1;
1969 if (!use_llvm)
1970 r = r600_bytecode_add_alu(ctx.bc, &alu);
1971 if (r)
1972 return r;
1973 }
1974 }
1975 }
1976
1977 /* Add stream outputs. */
1978 if (!ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX &&
1979 so.num_outputs && !use_llvm)
1980 emit_streamout(&ctx, &so);
1981
1982 convert_edgeflag_to_int(&ctx);
1983
1984 if (ring_outputs) {
1985 if (key.vs_as_es)
1986 emit_gs_ring_writes(&ctx, FALSE);
1987 } else {
1988 /* Export output */
1989 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
1990
1991 for (i = 0, j = 0; i < noutput; i++, j++) {
1992 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
1993 output[j].gpr = shader->output[i].gpr;
1994 output[j].elem_size = 3;
1995 output[j].swizzle_x = 0;
1996 output[j].swizzle_y = 1;
1997 output[j].swizzle_z = 2;
1998 output[j].swizzle_w = 3;
1999 output[j].burst_count = 1;
2000 output[j].type = -1;
2001 output[j].op = CF_OP_EXPORT;
2002 switch (ctx.type) {
2003 case TGSI_PROCESSOR_VERTEX:
2004 switch (shader->output[i].name) {
2005 case TGSI_SEMANTIC_POSITION:
2006 output[j].array_base = 60;
2007 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2008 pos_emitted = true;
2009 break;
2010
2011 case TGSI_SEMANTIC_PSIZE:
2012 output[j].array_base = 61;
2013 output[j].swizzle_y = 7;
2014 output[j].swizzle_z = 7;
2015 output[j].swizzle_w = 7;
2016 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2017 pos_emitted = true;
2018 break;
2019 case TGSI_SEMANTIC_EDGEFLAG:
2020 output[j].array_base = 61;
2021 output[j].swizzle_x = 7;
2022 output[j].swizzle_y = 0;
2023 output[j].swizzle_z = 7;
2024 output[j].swizzle_w = 7;
2025 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2026 pos_emitted = true;
2027 break;
2028 case TGSI_SEMANTIC_LAYER:
2029 /* spi_sid is 0 for outputs that are
2030 * not consumed by PS */
2031 if (shader->output[i].spi_sid) {
2032 output[j].array_base = next_param_base++;
2033 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2034 j++;
2035 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2036 }
2037 output[j].array_base = 61;
2038 output[j].swizzle_x = 7;
2039 output[j].swizzle_y = 7;
2040 output[j].swizzle_z = 0;
2041 output[j].swizzle_w = 7;
2042 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2043 pos_emitted = true;
2044 break;
2045 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2046 /* spi_sid is 0 for outputs that are
2047 * not consumed by PS */
2048 if (shader->output[i].spi_sid) {
2049 output[j].array_base = next_param_base++;
2050 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2051 j++;
2052 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2053 }
2054 output[j].array_base = 61;
2055 output[j].swizzle_x = 7;
2056 output[j].swizzle_y = 7;
2057 output[j].swizzle_z = 7;
2058 output[j].swizzle_w = 0;
2059 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2060 pos_emitted = true;
2061 break;
2062 case TGSI_SEMANTIC_CLIPVERTEX:
2063 j--;
2064 break;
2065 case TGSI_SEMANTIC_CLIPDIST:
2066 output[j].array_base = next_clip_base++;
2067 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2068 pos_emitted = true;
2069 /* spi_sid is 0 for clipdistance outputs that were generated
2070 * for clipvertex - we don't need to pass them to PS */
2071 if (shader->output[i].spi_sid) {
2072 j++;
2073 /* duplicate it as PARAM to pass to the pixel shader */
2074 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2075 output[j].array_base = next_param_base++;
2076 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2077 }
2078 break;
2079 case TGSI_SEMANTIC_FOG:
2080 output[j].swizzle_y = 4; /* 0 */
2081 output[j].swizzle_z = 4; /* 0 */
2082 output[j].swizzle_w = 5; /* 1 */
2083 break;
2084 }
2085 break;
2086 case TGSI_PROCESSOR_FRAGMENT:
2087 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
2088 /* never export more colors than the number of CBs */
2089 if (shader->output[i].sid >= max_color_exports) {
2090 /* skip export */
2091 j--;
2092 continue;
2093 }
2094 output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
2095 output[j].array_base = shader->output[i].sid;
2096 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2097 shader->nr_ps_color_exports++;
2098 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
2099 for (k = 1; k < max_color_exports; k++) {
2100 j++;
2101 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2102 output[j].gpr = shader->output[i].gpr;
2103 output[j].elem_size = 3;
2104 output[j].swizzle_x = 0;
2105 output[j].swizzle_y = 1;
2106 output[j].swizzle_z = 2;
2107 output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
2108 output[j].burst_count = 1;
2109 output[j].array_base = k;
2110 output[j].op = CF_OP_EXPORT;
2111 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2112 shader->nr_ps_color_exports++;
2113 }
2114 }
2115 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
2116 output[j].array_base = 61;
2117 output[j].swizzle_x = 2;
2118 output[j].swizzle_y = 7;
2119 output[j].swizzle_z = output[j].swizzle_w = 7;
2120 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2121 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
2122 output[j].array_base = 61;
2123 output[j].swizzle_x = 7;
2124 output[j].swizzle_y = 1;
2125 output[j].swizzle_z = output[j].swizzle_w = 7;
2126 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2127 } else {
2128 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
2129 r = -EINVAL;
2130 goto out_err;
2131 }
2132 break;
2133 default:
2134 R600_ERR("unsupported processor type %d\n", ctx.type);
2135 r = -EINVAL;
2136 goto out_err;
2137 }
2138
2139 if (output[j].type==-1) {
2140 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2141 output[j].array_base = next_param_base++;
2142 }
2143 }
2144
2145 /* add fake position export */
2146 if (ctx.type == TGSI_PROCESSOR_VERTEX && pos_emitted == false) {
2147 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2148 output[j].gpr = 0;
2149 output[j].elem_size = 3;
2150 output[j].swizzle_x = 7;
2151 output[j].swizzle_y = 7;
2152 output[j].swizzle_z = 7;
2153 output[j].swizzle_w = 7;
2154 output[j].burst_count = 1;
2155 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2156 output[j].array_base = 60;
2157 output[j].op = CF_OP_EXPORT;
2158 j++;
2159 }
2160
2161 /* add fake param output for vertex shader if no param is exported */
2162 if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
2163 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2164 output[j].gpr = 0;
2165 output[j].elem_size = 3;
2166 output[j].swizzle_x = 7;
2167 output[j].swizzle_y = 7;
2168 output[j].swizzle_z = 7;
2169 output[j].swizzle_w = 7;
2170 output[j].burst_count = 1;
2171 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2172 output[j].array_base = 0;
2173 output[j].op = CF_OP_EXPORT;
2174 j++;
2175 }
2176
2177 /* add fake pixel export */
2178 if (ctx.type == TGSI_PROCESSOR_FRAGMENT && shader->nr_ps_color_exports == 0) {
2179 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2180 output[j].gpr = 0;
2181 output[j].elem_size = 3;
2182 output[j].swizzle_x = 7;
2183 output[j].swizzle_y = 7;
2184 output[j].swizzle_z = 7;
2185 output[j].swizzle_w = 7;
2186 output[j].burst_count = 1;
2187 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2188 output[j].array_base = 0;
2189 output[j].op = CF_OP_EXPORT;
2190 j++;
2191 }
2192
2193 noutput = j;
2194
2195 /* set export done on last export of each type */
2196 for (i = noutput - 1, output_done = 0; i >= 0; i--) {
2197 if (!(output_done & (1 << output[i].type))) {
2198 output_done |= (1 << output[i].type);
2199 output[i].op = CF_OP_EXPORT_DONE;
2200 }
2201 }
2202 /* add output to bytecode */
2203 if (!use_llvm) {
2204 for (i = 0; i < noutput; i++) {
2205 r = r600_bytecode_add_output(ctx.bc, &output[i]);
2206 if (r)
2207 goto out_err;
2208 }
2209 }
2210 }
2211
2212 /* add program end */
2213 if (!use_llvm) {
2214 if (ctx.bc->chip_class == CAYMAN)
2215 cm_bytecode_add_cf_end(ctx.bc);
2216 else {
2217 const struct cf_op_info *last = NULL;
2218
2219 if (ctx.bc->cf_last)
2220 last = r600_isa_cf(ctx.bc->cf_last->op);
2221
2222 /* alu clause instructions don't have EOP bit, so add NOP */
2223 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS)
2224 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2225
2226 ctx.bc->cf_last->end_of_program = 1;
2227 }
2228 }
2229
2230 /* check GPR limit - we have 124 = 128 - 4
2231 * (4 are reserved as alu clause temporary registers) */
2232 if (ctx.bc->ngpr > 124) {
2233 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
2234 r = -ENOMEM;
2235 goto out_err;
2236 }
2237
2238 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2239 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
2240 return r;
2241 }
2242
2243 free(ctx.literals);
2244 tgsi_parse_free(&ctx.parse);
2245 return 0;
2246 out_err:
2247 free(ctx.literals);
2248 tgsi_parse_free(&ctx.parse);
2249 return r;
2250 }
2251
2252 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
2253 {
2254 R600_ERR("%s tgsi opcode unsupported\n",
2255 tgsi_get_opcode_name(ctx->inst_info->tgsi_opcode));
2256 return -EINVAL;
2257 }
2258
/* TGSI END needs no code here; the program epilogue is emitted elsewhere. */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	return 0;
}
2263
2264 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
2265 const struct r600_shader_src *shader_src,
2266 unsigned chan)
2267 {
2268 bc_src->sel = shader_src->sel;
2269 bc_src->chan = shader_src->swizzle[chan];
2270 bc_src->neg = shader_src->neg;
2271 bc_src->abs = shader_src->abs;
2272 bc_src->rel = shader_src->rel;
2273 bc_src->value = shader_src->value[bc_src->chan];
2274 bc_src->kc_bank = shader_src->kc_bank;
2275 }
2276
2277 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
2278 {
2279 bc_src->abs = 1;
2280 bc_src->neg = 0;
2281 }
2282
2283 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
2284 {
2285 bc_src->neg = !bc_src->neg;
2286 }
2287
2288 static void tgsi_dst(struct r600_shader_ctx *ctx,
2289 const struct tgsi_full_dst_register *tgsi_dst,
2290 unsigned swizzle,
2291 struct r600_bytecode_alu_dst *r600_dst)
2292 {
2293 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2294
2295 r600_dst->sel = tgsi_dst->Register.Index;
2296 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
2297 r600_dst->chan = swizzle;
2298 r600_dst->write = 1;
2299 if (tgsi_dst->Register.Indirect)
2300 r600_dst->rel = V_SQ_REL_RELATIVE;
2301 if (inst->Instruction.Saturate) {
2302 r600_dst->clamp = 1;
2303 }
2304 }
2305
/* Return the index of the highest channel set in a 4-bit write mask,
 * or 0 when the mask is empty. */
static int tgsi_last_instruction(unsigned writemask)
{
	int last = 0;

	for (int chan = 3; chan >= 0; chan--) {
		if (writemask & (1u << chan)) {
			last = chan;
			break;
		}
	}
	return last;
}
2317
/* Emit a two-source ALU op for every channel in the destination write mask.
 *
 * swap:       exchange src0/src1 (for TGSI ops whose operand order is
 *             reversed relative to the hardware opcode).
 * trans_only: the op runs only in the scalar (trans) unit, so every
 *             instruction must close its ALU group (alu.last = 1); when
 *             more than one channel is written, results are staged in
 *             ctx->temp_reg and copied to the real destination afterwards
 *             (the trans slot can only retire one channel per group).
 */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			/* stage in temp; moved to the real dst below */
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		/* handle some special cases */
		switch (ctx->inst_info->tgsi_opcode) {
		case TGSI_OPCODE_SUB:
			/* SUB is implemented as ADD with src1 negated */
			r600_bytecode_src_toggle_neg(&alu.src[1]);
			break;
		case TGSI_OPCODE_ABS:
			r600_bytecode_src_set_abs(&alu.src[0]);
			break;
		default:
			break;
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2387
/* Plain two-source ALU op: operand order matches the hardware opcode. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
2392
/* Two-source ALU op with src0/src1 exchanged (TGSI order is reversed
 * relative to the hardware opcode). */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
2397
/* Two-source ALU op restricted to the scalar (trans) unit; results are
 * staged through a temp when several channels are written. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
2402
2403 static int tgsi_ineg(struct r600_shader_ctx *ctx)
2404 {
2405 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2406 struct r600_bytecode_alu alu;
2407 int i, r;
2408 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2409
2410 for (i = 0; i < lasti + 1; i++) {
2411
2412 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2413 continue;
2414 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2415 alu.op = ctx->inst_info->op;
2416
2417 alu.src[0].sel = V_SQ_ALU_SRC_0;
2418
2419 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2420
2421 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2422
2423 if (i == lasti) {
2424 alu.last = 1;
2425 }
2426 r = r600_bytecode_add_alu(ctx->bc, &alu);
2427 if (r)
2428 return r;
2429 }
2430 return 0;
2431
2432 }
2433
/* Cayman has no scalar (trans) unit: a scalar transcendental op is issued
 * in several vector slots instead, each slot reading the .x source but
 * committing only its own write-masked destination channel.  The .w slot
 * is emitted only when the write mask includes it. */
static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);

			/* RSQ should take the absolute value of src */
			if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_RSQ) {
				r600_bytecode_src_set_abs(&alu.src[j]);
			}
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		/* commit only channels selected by the write mask */
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2463
/* Multi-slot integer multiply on Cayman: for every written channel k the
 * op must occupy all four vector slots (sources taken from channel k),
 * and only the slot whose index matches k commits its result. */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
	for (k = 0; k < last_slot; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			/* only the slot matching the target channel writes */
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2491
2492 /*
2493 * r600 - trunc to -PI..PI range
2494 * r700 - normalize by dividing by 2PI
2495 * see fdo bug 27901
2496 */
2497 static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
2498 {
2499 static float half_inv_pi = 1.0 /(3.1415926535 * 2);
2500 static float double_pi = 3.1415926535 * 2;
2501 static float neg_pi = -3.1415926535;
2502
2503 int r;
2504 struct r600_bytecode_alu alu;
2505
2506 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2507 alu.op = ALU_OP3_MULADD;
2508 alu.is_op3 = 1;
2509
2510 alu.dst.chan = 0;
2511 alu.dst.sel = ctx->temp_reg;
2512 alu.dst.write = 1;
2513
2514 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2515
2516 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2517 alu.src[1].chan = 0;
2518 alu.src[1].value = *(uint32_t *)&half_inv_pi;
2519 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2520 alu.src[2].chan = 0;
2521 alu.last = 1;
2522 r = r600_bytecode_add_alu(ctx->bc, &alu);
2523 if (r)
2524 return r;
2525
2526 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2527 alu.op = ALU_OP1_FRACT;
2528
2529 alu.dst.chan = 0;
2530 alu.dst.sel = ctx->temp_reg;
2531 alu.dst.write = 1;
2532
2533 alu.src[0].sel = ctx->temp_reg;
2534 alu.src[0].chan = 0;
2535 alu.last = 1;
2536 r = r600_bytecode_add_alu(ctx->bc, &alu);
2537 if (r)
2538 return r;
2539
2540 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2541 alu.op = ALU_OP3_MULADD;
2542 alu.is_op3 = 1;
2543
2544 alu.dst.chan = 0;
2545 alu.dst.sel = ctx->temp_reg;
2546 alu.dst.write = 1;
2547
2548 alu.src[0].sel = ctx->temp_reg;
2549 alu.src[0].chan = 0;
2550
2551 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2552 alu.src[1].chan = 0;
2553 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
2554 alu.src[2].chan = 0;
2555
2556 if (ctx->bc->chip_class == R600) {
2557 alu.src[1].value = *(uint32_t *)&double_pi;
2558 alu.src[2].value = *(uint32_t *)&neg_pi;
2559 } else {
2560 alu.src[1].sel = V_SQ_ALU_SRC_1;
2561 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2562 alu.src[2].neg = 1;
2563 }
2564
2565 alu.last = 1;
2566 r = r600_bytecode_add_alu(ctx->bc, &alu);
2567 if (r)
2568 return r;
2569 return 0;
2570 }
2571
/* SIN/COS on Cayman: normalize the angle with tgsi_setup_trig(), then
 * issue the scalar trig op across the vector slots (no trans unit on
 * Cayman), committing only the write-masked channels. */
static int cayman_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
	int i, r;

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;

		/* every slot reads the normalized angle from temp.x */
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2602
/* SIN/COS on r600-r700: normalize the angle, run the scalar trig op once
 * into temp.x, then replicate the result to each written channel. */
static int tgsi_trig(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	r = tgsi_setup_trig(ctx);
	if (r)
		return r;

	/* temp.x = trig(temp.x) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ctx->inst_info->op;
	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* replicate result */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		alu.src[0].sel = ctx->temp_reg;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2645
/* SCS: dst = (cos(src.x), sin(src.x), 0.0, 1.0), each component gated by
 * the write mask.  The angle is pre-normalized by tgsi_setup_trig() into
 * temp.x.  On Cayman the scalar SIN/COS must occupy three vector slots
 * with only the target slot writing. */
static int tgsi_scs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	/* We'll only need the trig stuff if we are going to write to the
	 * X or Y components of the destination vector.
	 */
	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
		r = tgsi_setup_trig(ctx);
		if (r)
			return r;
	}

	/* dst.x = COS */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_COS;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				/* only slot 0 commits to dst.x */
				if (i == 0)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_COS;
			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.y = SIN */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_SIN;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				/* only slot 1 commits to dst.y */
				if (i == 1)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_SIN;
			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.z = 0.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.w = 1.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
2766
/* Emit a per-channel kill comparison (op from the opcode table).  src0 is
 * constant 0; for unconditional KILL src1 is -1 so the comparison always
 * fires, for conditional kills src1 is the operand channel.
 * NOTE(review): the exact discard condition depends on which compare op
 * the table selects (e.g. KILLGT) — confirm against the opcode table. */
static int tgsi_kill(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int i, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.chan = i;

		alu.src[0].sel = V_SQ_ALU_SRC_0;

		if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILL) {
			/* unconditional: compare 0 against -1 */
			alu.src[1].sel = V_SQ_ALU_SRC_1;
			alu.src[1].neg = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* kill must be last in ALU */
	ctx->bc->force_add_cf = 1;
	ctx->shader->uses_kill = TRUE;
	return 0;
}
2799
/* LIT: classic per-vertex lighting coefficients per the TGSI definition:
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0)
 *   dst.z = lighting specular term, built from LOG_CLAMPED + MUL_LIT +
 *           EXP_IEEE (MUL_LIT handles the zero/negative special cases)
 *   dst.w = 1.0
 * On Cayman the scalar LOG/EXP ops are replicated across three slots
 * with only the .z slot committing. */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		int i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				/* only the .z slot commits */
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the LOG result landed for both paths */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
2949
/* RSQ: reciprocal square root of |src.x|, computed once in the scalar
 * unit into temp.x, then replicated to all written channels.  The abs
 * modifier keeps GL's RSQ from seeing a negative input. */
static int tgsi_rsq(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	/* XXX:
	 * For state trackers other than OpenGL, we'll want to use
	 * _RECIPSQRT_IEEE instead.
	 */
	alu.op = ALU_OP1_RECIPSQRT_CLAMPED;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
		r600_bytecode_src_set_abs(&alu.src[i]);
	}
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* replicate result */
	return tgsi_helper_tempx_replicate(ctx);
}
2977
2978 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
2979 {
2980 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2981 struct r600_bytecode_alu alu;
2982 int i, r;
2983
2984 for (i = 0; i < 4; i++) {
2985 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2986 alu.src[0].sel = ctx->temp_reg;
2987 alu.op = ALU_OP1_MOV;
2988 alu.dst.chan = i;
2989 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2990 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2991 if (i == 3)
2992 alu.last = 1;
2993 r = r600_bytecode_add_alu(ctx->bc, &alu);
2994 if (r)
2995 return r;
2996 }
2997 return 0;
2998 }
2999
3000 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
3001 {
3002 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3003 struct r600_bytecode_alu alu;
3004 int i, r;
3005
3006 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3007 alu.op = ctx->inst_info->op;
3008 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
3009 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
3010 }
3011 alu.dst.sel = ctx->temp_reg;
3012 alu.dst.write = 1;
3013 alu.last = 1;
3014 r = r600_bytecode_add_alu(ctx->bc, &alu);
3015 if (r)
3016 return r;
3017 /* replicate result */
3018 return tgsi_helper_tempx_replicate(ctx);
3019 }
3020
/* POW on Cayman: POW(a,b) = EXP2(b * LOG2(a)).  The scalar LOG2 and EXP2
 * ops must occupy multiple vector slots (no trans unit on Cayman); the
 * intermediate multiply is an ordinary vector op. */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	/* LOG2(a) replicated into temp.xyz */
	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a))*/
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3070
/* POW on r600-r700: POW(a,b) = EXP2(b * LOG2(a)), computed scalar-wise
 * through temp.x and then replicated to the written channels. */
static int tgsi_pow(struct r600_shader_ctx *ctx)
{
	struct r600_bytecode_alu alu;
	int r;

	/* LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_LOG_IEEE;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	/* POW(a,b) = EXP2(b * LOG2(a))*/
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_EXP_IEEE;
	alu.src[0].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return tgsi_helper_tempx_replicate(ctx);
}
3109
/*
 * Emit integer division / modulo (backs UDIV, UMOD, IDIV, IMOD).
 *
 * mod:       0 = emit the quotient (DIV), 1 = emit the remainder (MOD).
 * signed_op: 0 = unsigned operands, 1 = signed; for signed ops the operands
 *            are replaced by their absolute values up front (stored in tmp2)
 *            and the sign of the result is patched at the very end.
 *
 * The division itself is done with an approximate hardware reciprocal that
 * is refined by its measured rounding error; the step numbers in the inline
 * comments refer to the algorithm description immediately below.
 */
static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int tmp0 = ctx->temp_reg;
	int tmp1 = r600_get_temp(ctx);
	int tmp2 = r600_get_temp(ctx);
	int tmp3 = r600_get_temp(ctx);
	/* Unsigned path:
	 *
	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
	 *
	 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
	 * 2. tmp0.z = lo (tmp0.x * src2)
	 * 3. tmp0.w = -tmp0.z
	 * 4. tmp0.y = hi (tmp0.x * src2)
	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src2))
	 * 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error
	 * 7. tmp1.x = tmp0.x - tmp0.w
	 * 8. tmp1.y = tmp0.x + tmp0.w
	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
	 * 10. tmp0.z = hi(tmp0.x * src1)     = q
	 * 11. tmp0.y = lo (tmp0.z * src2)     = src2*q = src1 - r
	 *
	 * 12. tmp0.w = src1 - tmp0.y       = r
	 * 13. tmp1.x = tmp0.w >= src2		= r >= src2 (uint comparison)
	 * 14. tmp1.y = src1 >= tmp0.y      = r >= 0 (uint comparison)
	 *
	 * if DIV
	 *
	 *   15. tmp1.z = tmp0.z + 1			= q + 1
	 *   16. tmp1.w = tmp0.z - 1			= q - 1
	 *
	 * else MOD
	 *
	 *   15. tmp1.z = tmp0.w - src2			= r - src2
	 *   16. tmp1.w = tmp0.w + src2			= r + src2
	 *
	 * endif
	 *
	 * 17. tmp1.x = tmp1.x & tmp1.y
	 *
	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
	 *
	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
	 *
	 * Signed path:
	 *
	 * Same as unsigned, using abs values of the operands,
	 * and fixing the sign of the result in the end.
	 */

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		if (signed_op) {

			/* tmp2.x = -src0 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = -src1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.z sign bit is set if src0 and src2 signs are different */
			/* it will be a sign of the quotient */
			if (!mod) {

				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_XOR_INT;

				alu.dst.sel = tmp2;
				alu.dst.chan = 2;
				alu.dst.write = 1;

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			/* tmp2.x = |src0| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = |src1| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 1;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 1. tmp0.x = rcp_u (src2)     = 2^32/src2 + e, where e is rounding error */
		if (ctx->bc->chip_class == CAYMAN) {
			/* Cayman has no RECIP_UINT: emulate it via the float
			 * reciprocal (u2f -> recip -> scale by 2^32 -> f2u). */
			/* tmp3.x = u2f(src2) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_UINT_TO_FLT;

			alu.dst.sel = tmp3;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp0.x = recip(tmp3.x) */
			for (j = 0 ; j < 3; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 0);

				alu.src[0].sel = tmp3;
				alu.src[0].chan = 0;

				if (j == 2)
					alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			/* tmp3.x = tmp0.x * 2^32 (0x4f800000 = 4294967296.0f) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MUL;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0x4f800000;

			alu.dst.sel = tmp3;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			/* tmp0.x = f2u(tmp3.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_FLT_TO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = tmp3;
			alu.src[0].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 2. tmp0.z = lo (tmp0.x * src2) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULLO_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 2);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;
				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
				}

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULLO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;
			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 3. tmp0.w = -tmp0.z */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 4. tmp0.y = hi (tmp0.x * src2) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 1);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;

				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
				}
				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z)      = abs(lo(rcp*src)) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 6. tmp0.w = hi (tmp0.z * tmp0.x)    = e, rounding error */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 3);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 2;

				alu.src[1].sel = tmp0;
				alu.src[1].chan = 0;

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;

			alu.src[1].sel = tmp0;
			alu.src[1].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 7. tmp1.x = tmp0.x - tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 8. tmp1.y = tmp0.x + tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 0;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 10. tmp0.z = hi(tmp0.x * src1)     = q */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 2);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;

				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 0;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
				}

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 0;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 11. tmp0.y = lo (src2 * tmp0.z)     = src2*q = src1 - r */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULLO_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 1);

				if (signed_op) {
					alu.src[0].sel = tmp2;
					alu.src[0].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
				}

				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULLO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.src[1].sel = tmp0;
			alu.src[1].chan = 2;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 12. tmp0.w = src1 - tmp0.y       = r */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 13. tmp1.x = tmp0.w >= src2		= r >= src2 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 3;
		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 14. tmp1.y = src1 >= tmp0.y       = r >= 0 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (mod) { /* UMOD */

			/* 15. tmp1.z = tmp0.w - src2			= r - src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.w + src2			= r + src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;
			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else { /* UDIV */

			/* 15. tmp1.z = tmp0.z + 1			= q + 1 DIV */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.z - 1			= q - 1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 17. tmp1.x = tmp1.x & tmp1.y */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z    DIV */
		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z    MOD */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = mod ? 3 : 2;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
		/* For signed ops the result still needs its sign patched, so it
		 * goes to tmp0.z; otherwise it is written to the real dst. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		if (signed_op) {
			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;
		} else {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		}

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (signed_op) {

			/* fix the sign of the result */

			if (mod) {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* sign of the remainder is the same as the sign of src0 */
				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

			} else {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* fix the quotient sign (same as the sign of src0*src1) */
				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				alu.src[0].sel = tmp2;
				alu.src[0].chan = 2;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
	}
	return 0;
}
3953
/* UDIV: unsigned integer division (quotient). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 0);
}
3958
/* UMOD: unsigned integer modulo (remainder). */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 0);
}
3963
/* IDIV: signed integer division (quotient). */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 1);
}
3968
/* IMOD: signed integer modulo (remainder, sign follows src0). */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 1);
}
3973
3974
3975 static int tgsi_f2i(struct r600_shader_ctx *ctx)
3976 {
3977 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3978 struct r600_bytecode_alu alu;
3979 int i, r;
3980 unsigned write_mask = inst->Dst[0].Register.WriteMask;
3981 int last_inst = tgsi_last_instruction(write_mask);
3982
3983 for (i = 0; i < 4; i++) {
3984 if (!(write_mask & (1<<i)))
3985 continue;
3986
3987 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3988 alu.op = ALU_OP1_TRUNC;
3989
3990 alu.dst.sel = ctx->temp_reg;
3991 alu.dst.chan = i;
3992 alu.dst.write = 1;
3993
3994 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3995 if (i == last_inst)
3996 alu.last = 1;
3997 r = r600_bytecode_add_alu(ctx->bc, &alu);
3998 if (r)
3999 return r;
4000 }
4001
4002 for (i = 0; i < 4; i++) {
4003 if (!(write_mask & (1<<i)))
4004 continue;
4005
4006 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4007 alu.op = ctx->inst_info->op;
4008
4009 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4010
4011 alu.src[0].sel = ctx->temp_reg;
4012 alu.src[0].chan = i;
4013
4014 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
4015 alu.last = 1;
4016 r = r600_bytecode_add_alu(ctx->bc, &alu);
4017 if (r)
4018 return r;
4019 }
4020
4021 return 0;
4022 }
4023
4024 static int tgsi_iabs(struct r600_shader_ctx *ctx)
4025 {
4026 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4027 struct r600_bytecode_alu alu;
4028 int i, r;
4029 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4030 int last_inst = tgsi_last_instruction(write_mask);
4031
4032 /* tmp = -src */
4033 for (i = 0; i < 4; i++) {
4034 if (!(write_mask & (1<<i)))
4035 continue;
4036
4037 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4038 alu.op = ALU_OP2_SUB_INT;
4039
4040 alu.dst.sel = ctx->temp_reg;
4041 alu.dst.chan = i;
4042 alu.dst.write = 1;
4043
4044 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4045 alu.src[0].sel = V_SQ_ALU_SRC_0;
4046
4047 if (i == last_inst)
4048 alu.last = 1;
4049 r = r600_bytecode_add_alu(ctx->bc, &alu);
4050 if (r)
4051 return r;
4052 }
4053
4054 /* dst = (src >= 0 ? src : tmp) */
4055 for (i = 0; i < 4; i++) {
4056 if (!(write_mask & (1<<i)))
4057 continue;
4058
4059 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4060 alu.op = ALU_OP3_CNDGE_INT;
4061 alu.is_op3 = 1;
4062 alu.dst.write = 1;
4063
4064 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4065
4066 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4067 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4068 alu.src[2].sel = ctx->temp_reg;
4069 alu.src[2].chan = i;
4070
4071 if (i == last_inst)
4072 alu.last = 1;
4073 r = r600_bytecode_add_alu(ctx->bc, &alu);
4074 if (r)
4075 return r;
4076 }
4077 return 0;
4078 }
4079
4080 static int tgsi_issg(struct r600_shader_ctx *ctx)
4081 {
4082 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4083 struct r600_bytecode_alu alu;
4084 int i, r;
4085 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4086 int last_inst = tgsi_last_instruction(write_mask);
4087
4088 /* tmp = (src >= 0 ? src : -1) */
4089 for (i = 0; i < 4; i++) {
4090 if (!(write_mask & (1<<i)))
4091 continue;
4092
4093 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4094 alu.op = ALU_OP3_CNDGE_INT;
4095 alu.is_op3 = 1;
4096
4097 alu.dst.sel = ctx->temp_reg;
4098 alu.dst.chan = i;
4099 alu.dst.write = 1;
4100
4101 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4102 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4103 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
4104
4105 if (i == last_inst)
4106 alu.last = 1;
4107 r = r600_bytecode_add_alu(ctx->bc, &alu);
4108 if (r)
4109 return r;
4110 }
4111
4112 /* dst = (tmp > 0 ? 1 : tmp) */
4113 for (i = 0; i < 4; i++) {
4114 if (!(write_mask & (1<<i)))
4115 continue;
4116
4117 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4118 alu.op = ALU_OP3_CNDGT_INT;
4119 alu.is_op3 = 1;
4120 alu.dst.write = 1;
4121
4122 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4123
4124 alu.src[0].sel = ctx->temp_reg;
4125 alu.src[0].chan = i;
4126
4127 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
4128
4129 alu.src[2].sel = ctx->temp_reg;
4130 alu.src[2].chan = i;
4131
4132 if (i == last_inst)
4133 alu.last = 1;
4134 r = r600_bytecode_add_alu(ctx->bc, &alu);
4135 if (r)
4136 return r;
4137 }
4138 return 0;
4139 }
4140
4141
4142
4143 static int tgsi_ssg(struct r600_shader_ctx *ctx)
4144 {
4145 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4146 struct r600_bytecode_alu alu;
4147 int i, r;
4148
4149 /* tmp = (src > 0 ? 1 : src) */
4150 for (i = 0; i < 4; i++) {
4151 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4152 alu.op = ALU_OP3_CNDGT;
4153 alu.is_op3 = 1;
4154
4155 alu.dst.sel = ctx->temp_reg;
4156 alu.dst.chan = i;
4157
4158 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4159 alu.src[1].sel = V_SQ_ALU_SRC_1;
4160 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
4161
4162 if (i == 3)
4163 alu.last = 1;
4164 r = r600_bytecode_add_alu(ctx->bc, &alu);
4165 if (r)
4166 return r;
4167 }
4168
4169 /* dst = (-tmp > 0 ? -1 : tmp) */
4170 for (i = 0; i < 4; i++) {
4171 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4172 alu.op = ALU_OP3_CNDGT;
4173 alu.is_op3 = 1;
4174 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4175
4176 alu.src[0].sel = ctx->temp_reg;
4177 alu.src[0].chan = i;
4178 alu.src[0].neg = 1;
4179
4180 alu.src[1].sel = V_SQ_ALU_SRC_1;
4181 alu.src[1].neg = 1;
4182
4183 alu.src[2].sel = ctx->temp_reg;
4184 alu.src[2].chan = i;
4185
4186 if (i == 3)
4187 alu.last = 1;
4188 r = r600_bytecode_add_alu(ctx->bc, &alu);
4189 if (r)
4190 return r;
4191 }
4192 return 0;
4193 }
4194
4195 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
4196 {
4197 struct r600_bytecode_alu alu;
4198 int i, r;
4199
4200 for (i = 0; i < 4; i++) {
4201 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4202 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
4203 alu.op = ALU_OP0_NOP;
4204 alu.dst.chan = i;
4205 } else {
4206 alu.op = ALU_OP1_MOV;
4207 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4208 alu.src[0].sel = ctx->temp_reg;
4209 alu.src[0].chan = i;
4210 }
4211 if (i == 3) {
4212 alu.last = 1;
4213 }
4214 r = r600_bytecode_add_alu(ctx->bc, &alu);
4215 if (r)
4216 return r;
4217 }
4218 return 0;
4219 }
4220
4221 static int tgsi_op3(struct r600_shader_ctx *ctx)
4222 {
4223 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4224 struct r600_bytecode_alu alu;
4225 int i, j, r;
4226 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4227
4228 for (i = 0; i < lasti + 1; i++) {
4229 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4230 continue;
4231
4232 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4233 alu.op = ctx->inst_info->op;
4234 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4235 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
4236 }
4237
4238 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4239 alu.dst.chan = i;
4240 alu.dst.write = 1;
4241 alu.is_op3 = 1;
4242 if (i == lasti) {
4243 alu.last = 1;
4244 }
4245 r = r600_bytecode_add_alu(ctx->bc, &alu);
4246 if (r)
4247 return r;
4248 }
4249 return 0;
4250 }
4251
4252 static int tgsi_dp(struct r600_shader_ctx *ctx)
4253 {
4254 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4255 struct r600_bytecode_alu alu;
4256 int i, j, r;
4257
4258 for (i = 0; i < 4; i++) {
4259 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4260 alu.op = ctx->inst_info->op;
4261 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4262 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
4263 }
4264
4265 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4266 alu.dst.chan = i;
4267 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4268 /* handle some special cases */
4269 switch (ctx->inst_info->tgsi_opcode) {
4270 case TGSI_OPCODE_DP2:
4271 if (i > 1) {
4272 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
4273 alu.src[0].chan = alu.src[1].chan = 0;
4274 }
4275 break;
4276 case TGSI_OPCODE_DP3:
4277 if (i > 2) {
4278 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
4279 alu.src[0].chan = alu.src[1].chan = 0;
4280 }
4281 break;
4282 case TGSI_OPCODE_DPH:
4283 if (i == 3) {
4284 alu.src[0].sel = V_SQ_ALU_SRC_1;
4285 alu.src[0].chan = 0;
4286 alu.src[0].neg = 0;
4287 }
4288 break;
4289 default:
4290 break;
4291 }
4292 if (i == 3) {
4293 alu.last = 1;
4294 }
4295 r = r600_bytecode_add_alu(ctx->bc, &alu);
4296 if (r)
4297 return r;
4298 }
4299 return 0;
4300 }
4301
4302 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
4303 unsigned index)
4304 {
4305 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4306 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
4307 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
4308 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
4309 ctx->src[index].neg || ctx->src[index].abs;
4310 }
4311
4312 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
4313 unsigned index)
4314 {
4315 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4316 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
4317 }
4318
4319 static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
4320 {
4321 struct r600_bytecode_vtx vtx;
4322 struct r600_bytecode_alu alu;
4323 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4324 int src_gpr, r, i;
4325 int id = tgsi_tex_get_src_gpr(ctx, 1);
4326
4327 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
4328 if (src_requires_loading) {
4329 for (i = 0; i < 4; i++) {
4330 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4331 alu.op = ALU_OP1_MOV;
4332 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4333 alu.dst.sel = ctx->temp_reg;
4334 alu.dst.chan = i;
4335 if (i == 3)
4336 alu.last = 1;
4337 alu.dst.write = 1;
4338 r = r600_bytecode_add_alu(ctx->bc, &alu);
4339 if (r)
4340 return r;
4341 }
4342 src_gpr = ctx->temp_reg;
4343 }
4344
4345 memset(&vtx, 0, sizeof(vtx));
4346 vtx.op = FETCH_OP_VFETCH;
4347 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
4348 vtx.fetch_type = 2; /* VTX_FETCH_NO_INDEX_OFFSET */
4349 vtx.src_gpr = src_gpr;
4350 vtx.mega_fetch_count = 16;
4351 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
4352 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
4353 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
4354 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
4355 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
4356 vtx.use_const_fields = 1;
4357 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
4358
4359 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
4360 return r;
4361
4362 if (ctx->bc->chip_class >= EVERGREEN)
4363 return 0;
4364
4365 for (i = 0; i < 4; i++) {
4366 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4367 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4368 continue;
4369
4370 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4371 alu.op = ALU_OP2_AND_INT;
4372
4373 alu.dst.chan = i;
4374 alu.dst.sel = vtx.dst_gpr;
4375 alu.dst.write = 1;
4376
4377 alu.src[0].sel = vtx.dst_gpr;
4378 alu.src[0].chan = i;
4379
4380 alu.src[1].sel = 512 + (id * 2);
4381 alu.src[1].chan = i % 4;
4382 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
4383
4384 if (i == lasti)
4385 alu.last = 1;
4386 r = r600_bytecode_add_alu(ctx->bc, &alu);
4387 if (r)
4388 return r;
4389 }
4390
4391 if (inst->Dst[0].Register.WriteMask & 3) {
4392 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4393 alu.op = ALU_OP2_OR_INT;
4394
4395 alu.dst.chan = 3;
4396 alu.dst.sel = vtx.dst_gpr;
4397 alu.dst.write = 1;
4398
4399 alu.src[0].sel = vtx.dst_gpr;
4400 alu.src[0].chan = 3;
4401
4402 alu.src[1].sel = 512 + (id * 2) + 1;
4403 alu.src[1].chan = 0;
4404 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
4405
4406 alu.last = 1;
4407 r = r600_bytecode_add_alu(ctx->bc, &alu);
4408 if (r)
4409 return r;
4410 }
4411 return 0;
4412 }
4413
4414 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
4415 {
4416 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4417 struct r600_bytecode_alu alu;
4418 int r;
4419 int id = tgsi_tex_get_src_gpr(ctx, 1);
4420
4421 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4422 alu.op = ALU_OP1_MOV;
4423
4424 if (ctx->bc->chip_class >= EVERGREEN) {
4425 alu.src[0].sel = 512 + (id / 4);
4426 alu.src[0].chan = id % 4;
4427 } else {
4428 /* r600 we have them at channel 2 of the second dword */
4429 alu.src[0].sel = 512 + (id * 2) + 1;
4430 alu.src[0].chan = 1;
4431 }
4432 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
4433 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
4434 alu.last = 1;
4435 r = r600_bytecode_add_alu(ctx->bc, &alu);
4436 if (r)
4437 return r;
4438 return 0;
4439 }
4440
/* Translate one TGSI texture opcode (TEX/TXB/TXL/TXD/TXF/TXP/TXQ/...) into
 * r600 TEX-clause bytecode.
 *
 * The bulk of the work is massaging the coordinate operand into the form
 * the hardware wants:
 *  - loading sources into a GPR when the fetch cannot consume them directly,
 *  - CUBE coordinate setup (face selection, 1.5 bias, layer*8+face for arrays),
 *  - perspective divide for TXP,
 *  - texel offsets (emulated by integer adds for TXF),
 *  - FMASK-based sample remapping for compressed MSAA surfaces,
 *  - routing the shadow comparator / LOD / bias into the expected channel.
 *
 * Buffer textures are dispatched early to r600_do_buffer_txq()/
 * do_vtx_fetch_inst() since they use the vertex-fetch path instead.
 * Returns 0 on success or the first r600_bytecode_add_* error code.
 */
static int tgsi_tex(struct r600_shader_ctx *ctx)
{
	static float one_point_five = 1.5f;
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_tex tex;
	struct r600_bytecode_alu alu;
	unsigned src_gpr;
	int r, i, j;
	int opcode;
	bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
				    inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
				    (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
				     inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);

	/* TXF can't use hardware offsets; they are folded into the
	 * integer coordinates below instead. */
	bool txf_add_offsets = inst->Texture.NumOffsets &&
			       inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
			       inst->Texture.Texture != TGSI_TEXTURE_BUFFER;

	/* Texture fetch instructions can only use gprs as source.
	 * Also they cannot negate the source or take the absolute value */
	const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
					      tgsi_tex_src_requires_loading(ctx, 0)) ||
					     read_compressed_msaa || txf_add_offsets;

	boolean src_loaded = FALSE;
	/* which TGSI source register holds the sampler; TXQ_LZ has no coord,
	 * so the sampler is src 0 there */
	unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
	int8_t offset_x = 0, offset_y = 0, offset_z = 0;
	boolean has_txq_cube_array_z = false;

	/* TXQ.z on cube arrays (number of layers) is not provided by the
	 * hardware query; it is patched in from a driver constant below. */
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
	    ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
	      inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
		if (inst->Dst[0].Register.WriteMask & 4) {
			ctx->shader->has_txq_cube_array_z_comp = true;
			has_txq_cube_array_z = true;
		}

	/* two-source forms carry the extra arg in src 1 and the sampler in src 2 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
	    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
	    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
		sampler_src_reg = 2;

	src_gpr = tgsi_tex_get_src_gpr(ctx, 0);

	/* buffer textures go through the vertex-fetch path */
	if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
		if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
			ctx->shader->uses_tex_buffers = true;
			return r600_do_buffer_txq(ctx);
		}
		else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
			if (ctx->bc->chip_class < EVERGREEN)
				ctx->shader->uses_tex_buffers = true;
			return do_vtx_fetch_inst(ctx, src_requires_loading);
		}
	}

	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
		/* TGSI moves the sampler to src reg 3 for TXD */
		sampler_src_reg = 3;

		for (i = 1; i < 3; i++) {
			/* set gradients h/v */
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
				FETCH_OP_SET_GRADIENTS_V;
			tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
			tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;

			if (tgsi_tex_src_requires_loading(ctx, i)) {
				/* gradient operand needs to be copied into a
				 * plain GPR first */
				tex.src_gpr = r600_get_temp(ctx);
				tex.src_sel_x = 0;
				tex.src_sel_y = 1;
				tex.src_sel_z = 2;
				tex.src_sel_w = 3;

				for (j = 0; j < 4; j++) {
					memset(&alu, 0, sizeof(struct r600_bytecode_alu));
					alu.op = ALU_OP1_MOV;
					r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
					alu.dst.sel = tex.src_gpr;
					alu.dst.chan = j;
					if (j == 3)
						alu.last = 1;
					alu.dst.write = 1;
					r = r600_bytecode_add_alu(ctx->bc, &alu);
					if (r)
						return r;
				}

			} else {
				tex.src_gpr = tgsi_tex_get_src_gpr(ctx, i);
				tex.src_sel_x = ctx->src[i].swizzle[0];
				tex.src_sel_y = ctx->src[i].swizzle[1];
				tex.src_sel_z = ctx->src[i].swizzle[2];
				tex.src_sel_w = ctx->src[i].swizzle[3];
				tex.src_rel = ctx->src[i].rel;
			}
			tex.dst_gpr = ctx->temp_reg; /* just to avoid confusing the asm scheduler */
			tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
			if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
				tex.coord_type_x = 1;
				tex.coord_type_y = 1;
				tex.coord_type_z = 1;
				tex.coord_type_w = 1;
			}
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}
	} else if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
		int out_chan;
		/* Add perspective divide */
		if (ctx->bc->chip_class == CAYMAN) {
			/* Cayman: t-slot-only RECIP is replicated over the
			 * vector slots; only out_chan is actually written */
			out_chan = 2;
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2)
					alu.last = 1;
				if (out_chan == i)
					alu.dst.write = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}

		} else {
			out_chan = 3;
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = out_chan;
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.xyz = coord.xyz * (1/coord.w) */
		for (i = 0; i < 3; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MUL;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = out_chan;
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		/* temp.w = 1.0 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.last = 1;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		src_loaded = TRUE;
		src_gpr = ctx->temp_reg;
	}

	/* Cube map coordinate setup: select the major face and project the
	 * two minor coordinates onto it (except for TXQ forms, which query
	 * size and take no coordinate). */
	if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
	     inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
	    inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {

		static const unsigned src0_swizzle[] = {2, 2, 0, 1};
		static const unsigned src1_swizzle[] = {1, 0, 2, 2};

		/* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_CUBE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* tmp1.z = RCP_e(|tmp1.z|) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 2;
				alu.src[0].abs = 1;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 2;
			alu.src[0].abs = 1;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* MULADD R0.x,  R0.x,  PS1,  (0x3FC00000, 1.5f).x
		 * MULADD R0.y,  R0.y,  PS1,  (0x3FC00000, 1.5f).x
		 * muladd has no writemask, have to use another temp
		 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;
		alu.src[1].sel = ctx->temp_reg;
		alu.src[1].chan = 2;

		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[2].chan = 0;
		alu.src[2].value = *(uint32_t *)&one_point_five;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;
		alu.src[1].sel = ctx->temp_reg;
		alu.src[1].chan = 2;

		alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[2].chan = 0;
		alu.src[2].value = *(uint32_t *)&one_point_five;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		/* write initial compare value into Z component
		  - W src 0 for shadow cube
		  - X src 1 for shadow cube array */
		if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
			else
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
		    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
			if (ctx->bc->chip_class >= EVERGREEN) {
				int mytmp = r600_get_temp(ctx);
				static const float eight = 8.0f;
				/* save the face id produced by CUBE (temp.w) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 3;
				alu.dst.sel = mytmp;
				alu.dst.chan = 0;
				alu.dst.write = 1;
				alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;

				/* have to multiply original layer by 8 and add to face id (temp.w) in Z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_MULADD;
				alu.is_op3 = 1;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
				alu.src[1].chan = 0;
				alu.src[1].value = *(uint32_t *)&eight;
				alu.src[2].sel = mytmp;
				alu.src[2].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			} else if (ctx->bc->chip_class < EVERGREEN) {
				/* r600/r700: pass the layer via a dedicated
				 * SET_CUBEMAP_INDEX fetch instead */
				memset(&tex, 0, sizeof(struct r600_bytecode_tex));
				tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
				tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
				tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
				tex.src_gpr = r600_get_temp(ctx);
				tex.src_sel_x = 0;
				tex.src_sel_y = 0;
				tex.src_sel_z = 0;
				tex.src_sel_w = 0;
				tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
				tex.coord_type_x = 1;
				tex.coord_type_y = 1;
				tex.coord_type_z = 1;
				tex.coord_type_w = 1;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
				alu.dst.sel = tex.src_gpr;
				alu.dst.chan = 0;
				alu.last = 1;
				alu.dst.write = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;

				r = r600_bytecode_add_tex(ctx->bc, &tex);
				if (r)
					return r;
			}

		}

		/* for cube forms of lod and bias we need to route things */
		if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
		    inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
		    inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
		    inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
			    inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
				r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
			else
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		src_loaded = TRUE;
		src_gpr = ctx->temp_reg;
	}

	/* generic fallback: copy the coordinate into the temp GPR */
	if (src_requires_loading && !src_loaded) {
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			if (i == 3)
				alu.last = 1;
			alu.dst.write = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		src_loaded = TRUE;
		src_gpr = ctx->temp_reg;
	}

	/* get offset values */
	if (inst->Texture.NumOffsets) {
		assert(inst->Texture.NumOffsets == 1);

		/* The texture offset feature doesn't work with the TXF instruction
		 * and must be emulated by adding the offset to the texture coordinates. */
		if (txf_add_offsets) {
			const struct tgsi_texture_offset *off = inst->TexOffsets;

			/* integer adds per dimension; cases cascade from the
			 * highest dimension down */
			switch (inst->Texture.Texture) {
			case TGSI_TEXTURE_3D:
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_ADD_INT;
				alu.src[0].sel = src_gpr;
				alu.src[0].chan = 2;
				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
				alu.dst.sel = src_gpr;
				alu.dst.chan = 2;
				alu.dst.write = 1;
				alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
				/* fall through */

			case TGSI_TEXTURE_2D:
			case TGSI_TEXTURE_SHADOW2D:
			case TGSI_TEXTURE_RECT:
			case TGSI_TEXTURE_SHADOWRECT:
			case TGSI_TEXTURE_2D_ARRAY:
			case TGSI_TEXTURE_SHADOW2D_ARRAY:
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_ADD_INT;
				alu.src[0].sel = src_gpr;
				alu.src[0].chan = 1;
				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
				alu.dst.sel = src_gpr;
				alu.dst.chan = 1;
				alu.dst.write = 1;
				alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
				/* fall through */

			case TGSI_TEXTURE_1D:
			case TGSI_TEXTURE_SHADOW1D:
			case TGSI_TEXTURE_1D_ARRAY:
			case TGSI_TEXTURE_SHADOW1D_ARRAY:
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_ADD_INT;
				alu.src[0].sel = src_gpr;
				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
				alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
				alu.dst.sel = src_gpr;
				alu.dst.write = 1;
				alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
				break;
				/* texture offsets do not apply to other texture targets */
			}
		} else {
			/* hardware offsets are in 0.5-texel units, hence << 1 */
			offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
			offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
			offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
		}
	}

	/* Obtain the sample index for reading a compressed MSAA color texture.
	 * To read the FMASK, we use the ldfptr instruction, which tells us
	 * where the samples are stored.
	 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
	 * which is the identity mapping. Each nibble says which physical sample
	 * should be fetched to get that sample.
	 *
	 * Assume src.z contains the sample index. It should be modified like this:
	 *   src.z = (ldfptr() >> (src.z * 4)) & 0xF;
	 * Then fetch the texel with src.
	 */
	if (read_compressed_msaa) {
		unsigned sample_chan = 3;
		unsigned temp = r600_get_temp(ctx);
		assert(src_loaded);

		/* temp.w = ldfptr() */
		memset(&tex, 0, sizeof(struct r600_bytecode_tex));
		tex.op = FETCH_OP_LD;
		tex.inst_mod = 1; /* to indicate this is ldfptr */
		tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
		tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
		tex.src_gpr = src_gpr;
		tex.dst_gpr = temp;
		tex.dst_sel_x = 7; /* mask out these components */
		tex.dst_sel_y = 7;
		tex.dst_sel_z = 7;
		tex.dst_sel_w = 0; /* store X */
		tex.src_sel_x = 0;
		tex.src_sel_y = 1;
		tex.src_sel_z = 2;
		tex.src_sel_w = 3;
		tex.offset_x = offset_x;
		tex.offset_y = offset_y;
		tex.offset_z = offset_z;
		r = r600_bytecode_add_tex(ctx->bc, &tex);
		if (r)
			return r;

		/* temp.x = sample_index*4 */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 4; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULLO_INT;
				alu.src[0].sel = src_gpr;
				alu.src[0].chan = sample_chan;
				alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
				alu.src[1].value = 4;
				alu.dst.sel = temp;
				alu.dst.chan = i;
				alu.dst.write = i == 0;
				if (i == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULLO_INT;
			alu.src[0].sel = src_gpr;
			alu.src[0].chan = sample_chan;
			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 4;
			alu.dst.sel = temp;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* sample_index = temp.w >> temp.x */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHR_INT;
		alu.src[0].sel = temp;
		alu.src[0].chan = 3;
		alu.src[1].sel = temp;
		alu.src[1].chan = 0;
		alu.dst.sel = src_gpr;
		alu.dst.chan = sample_chan;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* sample_index & 0xF */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;
		alu.src[0].sel = src_gpr;
		alu.src[0].chan = sample_chan;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = 0xF;
		alu.dst.sel = src_gpr;
		alu.dst.chan = sample_chan;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
#if 0
		/* visualize the FMASK */
		for (i = 0; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_INT_TO_FLT;
			alu.src[0].sel = src_gpr;
			alu.src[0].chan = sample_chan;
			alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
			alu.dst.chan = i;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
#endif
	}

	/* does this shader want a num layers from TXQ for a cube array? */
	if (has_txq_cube_array_z) {
		int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;

		/* layer count is supplied by the driver in the TXQ
		 * constant buffer, one value per sampler */
		alu.src[0].sel = 512 + (id / 4);
		alu.src[0].kc_bank = R600_TXQ_CONST_BUFFER;
		alu.src[0].chan = id % 4;
		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		/* disable writemask from texture instruction */
		inst->Dst[0].Register.WriteMask &= ~4;
	}

	/* switch to the compare variant of the sample opcode for
	 * shadow samplers */
	opcode = ctx->inst_info->op;
	if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
		switch (opcode) {
		case FETCH_OP_SAMPLE:
			opcode = FETCH_OP_SAMPLE_C;
			break;
		case FETCH_OP_SAMPLE_L:
			opcode = FETCH_OP_SAMPLE_C_L;
			break;
		case FETCH_OP_SAMPLE_LB:
			opcode = FETCH_OP_SAMPLE_C_LB;
			break;
		case FETCH_OP_SAMPLE_G:
			opcode = FETCH_OP_SAMPLE_C_G;
			break;
		}
	}

	memset(&tex, 0, sizeof(struct r600_bytecode_tex));
	tex.op = opcode;

	tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
	tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
	tex.src_gpr = src_gpr;
	tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
	tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
	tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
	tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
	tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;

	if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) {
		/* no coordinate: select 0 (sel 4) for every component */
		tex.src_sel_x = 4;
		tex.src_sel_y = 4;
		tex.src_sel_z = 4;
		tex.src_sel_w = 4;
	} else if (src_loaded) {
		tex.src_sel_x = 0;
		tex.src_sel_y = 1;
		tex.src_sel_z = 2;
		tex.src_sel_w = 3;
	} else {
		tex.src_sel_x = ctx->src[0].swizzle[0];
		tex.src_sel_y = ctx->src[0].swizzle[1];
		tex.src_sel_z = ctx->src[0].swizzle[2];
		tex.src_sel_w = ctx->src[0].swizzle[3];
		tex.src_rel = ctx->src[0].rel;
	}

	if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
	    inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
		tex.src_sel_x = 1;
		tex.src_sel_y = 0;
		tex.src_sel_z = 3;
		tex.src_sel_w = 2; /* route Z compare or Lod value into W */
	}

	if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
	    inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
		tex.coord_type_x = 1;
		tex.coord_type_y = 1;
	}
	tex.coord_type_z = 1;
	tex.coord_type_w = 1;

	tex.offset_x = offset_x;
	tex.offset_y = offset_y;
	tex.offset_z = offset_z;

	/* Put the depth for comparison in W.
	 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
	 * Some instructions expect the depth in Z. */
	if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
	     inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
	     inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
	    opcode != FETCH_OP_SAMPLE_C_L &&
	    opcode != FETCH_OP_SAMPLE_C_LB) {
		tex.src_sel_w = tex.src_sel_z;
	}

	if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
	    inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
		if (opcode == FETCH_OP_SAMPLE_C_L ||
		    opcode == FETCH_OP_SAMPLE_C_LB) {
			/* the array index is read from Y */
			tex.coord_type_y = 0;
		} else {
			/* the array index is read from Z */
			tex.coord_type_z = 0;
			tex.src_sel_z = tex.src_sel_y;
		}
	} else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
		   inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
		   ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
		     inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
		    (ctx->bc->chip_class >= EVERGREEN)))
		/* the array index is read from Z */
		tex.coord_type_z = 0;

	/* mask unused source components */
	if (opcode == FETCH_OP_SAMPLE) {
		switch (inst->Texture.Texture) {
		case TGSI_TEXTURE_2D:
		case TGSI_TEXTURE_RECT:
			tex.src_sel_z = 7;
			tex.src_sel_w = 7;
			break;
		case TGSI_TEXTURE_1D_ARRAY:
			tex.src_sel_y = 7;
			tex.src_sel_w = 7;
			break;
		case TGSI_TEXTURE_1D:
			tex.src_sel_y = 7;
			tex.src_sel_z = 7;
			tex.src_sel_w = 7;
			break;
		}
	}

	r = r600_bytecode_add_tex(ctx->bc, &tex);
	if (r)
		return r;

	/* add shadow ambient support - gallium doesn't do it yet */
	return 0;
}
5195
5196 static int tgsi_lrp(struct r600_shader_ctx *ctx)
5197 {
5198 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5199 struct r600_bytecode_alu alu;
5200 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5201 unsigned i;
5202 int r;
5203
5204 /* optimize if it's just an equal balance */
5205 if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
5206 for (i = 0; i < lasti + 1; i++) {
5207 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5208 continue;
5209
5210 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5211 alu.op = ALU_OP2_ADD;
5212 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5213 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5214 alu.omod = 3;
5215 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5216 alu.dst.chan = i;
5217 if (i == lasti) {
5218 alu.last = 1;
5219 }
5220 r = r600_bytecode_add_alu(ctx->bc, &alu);
5221 if (r)
5222 return r;
5223 }
5224 return 0;
5225 }
5226
5227 /* 1 - src0 */
5228 for (i = 0; i < lasti + 1; i++) {
5229 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5230 continue;
5231
5232 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5233 alu.op = ALU_OP2_ADD;
5234 alu.src[0].sel = V_SQ_ALU_SRC_1;
5235 alu.src[0].chan = 0;
5236 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5237 r600_bytecode_src_toggle_neg(&alu.src[1]);
5238 alu.dst.sel = ctx->temp_reg;
5239 alu.dst.chan = i;
5240 if (i == lasti) {
5241 alu.last = 1;
5242 }
5243 alu.dst.write = 1;
5244 r = r600_bytecode_add_alu(ctx->bc, &alu);
5245 if (r)
5246 return r;
5247 }
5248
5249 /* (1 - src0) * src2 */
5250 for (i = 0; i < lasti + 1; i++) {
5251 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5252 continue;
5253
5254 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5255 alu.op = ALU_OP2_MUL;
5256 alu.src[0].sel = ctx->temp_reg;
5257 alu.src[0].chan = i;
5258 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5259 alu.dst.sel = ctx->temp_reg;
5260 alu.dst.chan = i;
5261 if (i == lasti) {
5262 alu.last = 1;
5263 }
5264 alu.dst.write = 1;
5265 r = r600_bytecode_add_alu(ctx->bc, &alu);
5266 if (r)
5267 return r;
5268 }
5269
5270 /* src0 * src1 + (1 - src0) * src2 */
5271 for (i = 0; i < lasti + 1; i++) {
5272 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5273 continue;
5274
5275 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5276 alu.op = ALU_OP3_MULADD;
5277 alu.is_op3 = 1;
5278 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5279 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5280 alu.src[2].sel = ctx->temp_reg;
5281 alu.src[2].chan = i;
5282
5283 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5284 alu.dst.chan = i;
5285 if (i == lasti) {
5286 alu.last = 1;
5287 }
5288 r = r600_bytecode_add_alu(ctx->bc, &alu);
5289 if (r)
5290 return r;
5291 }
5292 return 0;
5293 }
5294
5295 static int tgsi_cmp(struct r600_shader_ctx *ctx)
5296 {
5297 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5298 struct r600_bytecode_alu alu;
5299 int i, r;
5300 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5301
5302 for (i = 0; i < lasti + 1; i++) {
5303 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5304 continue;
5305
5306 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5307 alu.op = ALU_OP3_CNDGE;
5308 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5309 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5310 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
5311 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5312 alu.dst.chan = i;
5313 alu.dst.write = 1;
5314 alu.is_op3 = 1;
5315 if (i == lasti)
5316 alu.last = 1;
5317 r = r600_bytecode_add_alu(ctx->bc, &alu);
5318 if (r)
5319 return r;
5320 }
5321 return 0;
5322 }
5323
5324 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
5325 {
5326 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5327 struct r600_bytecode_alu alu;
5328 int i, r;
5329 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5330
5331 for (i = 0; i < lasti + 1; i++) {
5332 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5333 continue;
5334
5335 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5336 alu.op = ALU_OP3_CNDGE_INT;
5337 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5338 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5339 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
5340 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5341 alu.dst.chan = i;
5342 alu.dst.write = 1;
5343 alu.is_op3 = 1;
5344 if (i == lasti)
5345 alu.last = 1;
5346 r = r600_bytecode_add_alu(ctx->bc, &alu);
5347 if (r)
5348 return r;
5349 }
5350 return 0;
5351 }
5352
/* XPD (cross product): dst.xyz = src0 x src1, dst.w = 0 (built from 0*0
 * operands).  Two passes:
 *   pass 1: temp[i] = src0[s0[i]] * src1[s1[i]]
 *   pass 2: dst[i]  = src0[s1[i]] * src1[s0[i]] - temp[i]
 * e.g. for x: dst.x = src0.y*src1.z - src0.z*src1.y, the standard sign. */
static int tgsi_xpd(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* Rotated channel selects for the two cross-product terms. */
	static const unsigned int src0_swizzle[] = {2, 0, 1};
	static const unsigned int src1_swizzle[] = {1, 2, 0};
	struct r600_bytecode_alu alu;
	uint32_t use_temp = 0;
	int i, r;

	/* With a partial write mask, go through temp_reg and copy out at the
	 * end so a destination that aliases a source is not clobbered
	 * mid-computation. */
	if (inst->Dst[0].Register.WriteMask != 0xf)
		use_temp = 1;

	/* Pass 1: temp = src0.zxy0 * src1.yzx0 */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		if (i < 3) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
		} else {
			/* w channel: 0 * 0 */
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Pass 2: dst = src0.yzx0 * src1.zxy0 - temp (src[2].neg). */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;

		if (i < 3) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
		} else {
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		/* Subtract the pass-1 product. */
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].neg = 1;
		alu.src[2].chan = i;

		if (use_temp)
			alu.dst.sel = ctx->temp_reg;
		else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	if (use_temp)
		return tgsi_helper_copy(ctx, inst);
	return 0;
}
5424
/* TGSI EXP:
 *   result.x = 2^floor(src.x)
 *   result.y = fract(src.x)
 *   result.z = 2^src.x (rough approximation)
 *   result.w = 1.0
 * Each enabled channel is computed into temp_reg, then tgsi_helper_copy()
 * moves the masked channels to the real destination.  EXP_IEEE is a
 * t-slot-only op; on Cayman it is replicated across three vector slots
 * instead (see the CAYMAN notes at the top of this file). */
static int tgsi_exp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i;

	/* result.x = 2^floor(src); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* NOTE(review): 'alu' is reused below without a fresh memset;
		 * src[0].sel/chan are overwritten, but any modifier flags
		 * r600_bytecode_src() copied from the TGSI source (neg/abs/rel)
		 * would persist into the EXP — confirm those are reset. */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = i == 0;	/* keep only chan 0 */
				alu.last = i == 2;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.y = fract(src) (equivalent to src - floor(src)); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FRACT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
#if 0
		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (r)
			return r;
#endif
		alu.dst.write = 1;
		alu.dst.chan = 1;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = RoughApprox2ToX(tmp);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					/* only the last slot's result is kept */
					alu.dst.write = 1;
					alu.last = 1;
				}

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;

			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0;*/
	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	/* Copy the channels built in temp_reg to the real destination. */
	return tgsi_helper_copy(ctx, inst);
}
5553
5554 static int tgsi_log(struct r600_shader_ctx *ctx)
5555 {
5556 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5557 struct r600_bytecode_alu alu;
5558 int r;
5559 int i;
5560
5561 /* result.x = floor(log2(|src|)); */
5562 if (inst->Dst[0].Register.WriteMask & 1) {
5563 if (ctx->bc->chip_class == CAYMAN) {
5564 for (i = 0; i < 3; i++) {
5565 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5566
5567 alu.op = ALU_OP1_LOG_IEEE;
5568 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5569 r600_bytecode_src_set_abs(&alu.src[0]);
5570
5571 alu.dst.sel = ctx->temp_reg;
5572 alu.dst.chan = i;
5573 if (i == 0)
5574 alu.dst.write = 1;
5575 if (i == 2)
5576 alu.last = 1;
5577 r = r600_bytecode_add_alu(ctx->bc, &alu);
5578 if (r)
5579 return r;
5580 }
5581
5582 } else {
5583 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5584
5585 alu.op = ALU_OP1_LOG_IEEE;
5586 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5587 r600_bytecode_src_set_abs(&alu.src[0]);
5588
5589 alu.dst.sel = ctx->temp_reg;
5590 alu.dst.chan = 0;
5591 alu.dst.write = 1;
5592 alu.last = 1;
5593 r = r600_bytecode_add_alu(ctx->bc, &alu);
5594 if (r)
5595 return r;
5596 }
5597
5598 alu.op = ALU_OP1_FLOOR;
5599 alu.src[0].sel = ctx->temp_reg;
5600 alu.src[0].chan = 0;
5601
5602 alu.dst.sel = ctx->temp_reg;
5603 alu.dst.chan = 0;
5604 alu.dst.write = 1;
5605 alu.last = 1;
5606
5607 r = r600_bytecode_add_alu(ctx->bc, &alu);
5608 if (r)
5609 return r;
5610 }
5611
5612 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
5613 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
5614
5615 if (ctx->bc->chip_class == CAYMAN) {
5616 for (i = 0; i < 3; i++) {
5617 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5618
5619 alu.op = ALU_OP1_LOG_IEEE;
5620 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5621 r600_bytecode_src_set_abs(&alu.src[0]);
5622
5623 alu.dst.sel = ctx->temp_reg;
5624 alu.dst.chan = i;
5625 if (i == 1)
5626 alu.dst.write = 1;
5627 if (i == 2)
5628 alu.last = 1;
5629
5630 r = r600_bytecode_add_alu(ctx->bc, &alu);
5631 if (r)
5632 return r;
5633 }
5634 } else {
5635 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5636
5637 alu.op = ALU_OP1_LOG_IEEE;
5638 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5639 r600_bytecode_src_set_abs(&alu.src[0]);
5640
5641 alu.dst.sel = ctx->temp_reg;
5642 alu.dst.chan = 1;
5643 alu.dst.write = 1;
5644 alu.last = 1;
5645
5646 r = r600_bytecode_add_alu(ctx->bc, &alu);
5647 if (r)
5648 return r;
5649 }
5650
5651 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5652
5653 alu.op = ALU_OP1_FLOOR;
5654 alu.src[0].sel = ctx->temp_reg;
5655 alu.src[0].chan = 1;
5656
5657 alu.dst.sel = ctx->temp_reg;
5658 alu.dst.chan = 1;
5659 alu.dst.write = 1;
5660 alu.last = 1;
5661
5662 r = r600_bytecode_add_alu(ctx->bc, &alu);
5663 if (r)
5664 return r;
5665
5666 if (ctx->bc->chip_class == CAYMAN) {
5667 for (i = 0; i < 3; i++) {
5668 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5669 alu.op = ALU_OP1_EXP_IEEE;
5670 alu.src[0].sel = ctx->temp_reg;
5671 alu.src[0].chan = 1;
5672
5673 alu.dst.sel = ctx->temp_reg;
5674 alu.dst.chan = i;
5675 if (i == 1)
5676 alu.dst.write = 1;
5677 if (i == 2)
5678 alu.last = 1;
5679
5680 r = r600_bytecode_add_alu(ctx->bc, &alu);
5681 if (r)
5682 return r;
5683 }
5684 } else {
5685 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5686 alu.op = ALU_OP1_EXP_IEEE;
5687 alu.src[0].sel = ctx->temp_reg;
5688 alu.src[0].chan = 1;
5689
5690 alu.dst.sel = ctx->temp_reg;
5691 alu.dst.chan = 1;
5692 alu.dst.write = 1;
5693 alu.last = 1;
5694
5695 r = r600_bytecode_add_alu(ctx->bc, &alu);
5696 if (r)
5697 return r;
5698 }
5699
5700 if (ctx->bc->chip_class == CAYMAN) {
5701 for (i = 0; i < 3; i++) {
5702 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5703 alu.op = ALU_OP1_RECIP_IEEE;
5704 alu.src[0].sel = ctx->temp_reg;
5705 alu.src[0].chan = 1;
5706
5707 alu.dst.sel = ctx->temp_reg;
5708 alu.dst.chan = i;
5709 if (i == 1)
5710 alu.dst.write = 1;
5711 if (i == 2)
5712 alu.last = 1;
5713
5714 r = r600_bytecode_add_alu(ctx->bc, &alu);
5715 if (r)
5716 return r;
5717 }
5718 } else {
5719 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5720 alu.op = ALU_OP1_RECIP_IEEE;
5721 alu.src[0].sel = ctx->temp_reg;
5722 alu.src[0].chan = 1;
5723
5724 alu.dst.sel = ctx->temp_reg;
5725 alu.dst.chan = 1;
5726 alu.dst.write = 1;
5727 alu.last = 1;
5728
5729 r = r600_bytecode_add_alu(ctx->bc, &alu);
5730 if (r)
5731 return r;
5732 }
5733
5734 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5735
5736 alu.op = ALU_OP2_MUL;
5737
5738 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5739 r600_bytecode_src_set_abs(&alu.src[0]);
5740
5741 alu.src[1].sel = ctx->temp_reg;
5742 alu.src[1].chan = 1;
5743
5744 alu.dst.sel = ctx->temp_reg;
5745 alu.dst.chan = 1;
5746 alu.dst.write = 1;
5747 alu.last = 1;
5748
5749 r = r600_bytecode_add_alu(ctx->bc, &alu);
5750 if (r)
5751 return r;
5752 }
5753
5754 /* result.z = log2(|src|);*/
5755 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
5756 if (ctx->bc->chip_class == CAYMAN) {
5757 for (i = 0; i < 3; i++) {
5758 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5759
5760 alu.op = ALU_OP1_LOG_IEEE;
5761 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5762 r600_bytecode_src_set_abs(&alu.src[0]);
5763
5764 alu.dst.sel = ctx->temp_reg;
5765 if (i == 2)
5766 alu.dst.write = 1;
5767 alu.dst.chan = i;
5768 if (i == 2)
5769 alu.last = 1;
5770
5771 r = r600_bytecode_add_alu(ctx->bc, &alu);
5772 if (r)
5773 return r;
5774 }
5775 } else {
5776 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5777
5778 alu.op = ALU_OP1_LOG_IEEE;
5779 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5780 r600_bytecode_src_set_abs(&alu.src[0]);
5781
5782 alu.dst.sel = ctx->temp_reg;
5783 alu.dst.write = 1;
5784 alu.dst.chan = 2;
5785 alu.last = 1;
5786
5787 r = r600_bytecode_add_alu(ctx->bc, &alu);
5788 if (r)
5789 return r;
5790 }
5791 }
5792
5793 /* result.w = 1.0; */
5794 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
5795 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5796
5797 alu.op = ALU_OP1_MOV;
5798 alu.src[0].sel = V_SQ_ALU_SRC_1;
5799 alu.src[0].chan = 0;
5800
5801 alu.dst.sel = ctx->temp_reg;
5802 alu.dst.chan = 3;
5803 alu.dst.write = 1;
5804 alu.last = 1;
5805
5806 r = r600_bytecode_add_alu(ctx->bc, &alu);
5807 if (r)
5808 return r;
5809 }
5810
5811 return tgsi_helper_copy(ctx, inst);
5812 }
5813
/* Evergreen+ ARL/ARR/UARL: compute the address value into ctx->bc->ar_reg.
 * The AR register itself is loaded lazily — ar_loaded is cleared here and
 * the MOVA is emitted later, when an instruction actually uses relative
 * addressing. */
static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
		break;
	case TGSI_OPCODE_ARR:
		alu.op = ALU_OP1_FLT_TO_INT;
		break;
	case TGSI_OPCODE_UARL:
		/* source is already an integer */
		alu.op = ALU_OP1_MOV;
		break;
	default:
		assert(0);
		return -1;
	}

	/* NOTE(review): 'alu' is memset once and reused for every channel;
	 * r600_bytecode_src() rewrites src[0] each iteration, so this relies
	 * on it (re)setting every source field it copies. */
	for (i = 0; i <= lasti; ++i) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.last = i == lasti;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Invalidate the cached AR so the next relative access reloads it. */
	ctx->bc->ar_loaded = 0;
	return 0;
}
/* Pre-Evergreen (r600/r700) ARL/ARR/UARL: compute the address value into
 * ctx->bc->ar_reg.  FLT_TO_INT_FLOOR does not exist on these chips, so
 * ARL is emulated as FLOOR followed by FLT_TO_INT.  As in tgsi_eg_arl(),
 * the actual AR load is deferred (ar_loaded cleared at the end). */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* Pass 1: ar_reg = floor(src), written channels only. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		/* Pass 2: ar_reg = (int)ar_reg.
		 * NOTE(review): unlike pass 1, this loop ignores the write
		 * mask and also converts channels pass 1 never wrote —
		 * confirm whether that is intentional. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		/* Source is already an integer — plain MOV. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* Invalidate the cached AR so the next relative access reloads it. */
	ctx->bc->ar_loaded = 0;
	return 0;
}
5930
5931 static int tgsi_opdst(struct r600_shader_ctx *ctx)
5932 {
5933 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5934 struct r600_bytecode_alu alu;
5935 int i, r = 0;
5936
5937 for (i = 0; i < 4; i++) {
5938 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5939
5940 alu.op = ALU_OP2_MUL;
5941 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5942
5943 if (i == 0 || i == 3) {
5944 alu.src[0].sel = V_SQ_ALU_SRC_1;
5945 } else {
5946 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5947 }
5948
5949 if (i == 0 || i == 2) {
5950 alu.src[1].sel = V_SQ_ALU_SRC_1;
5951 } else {
5952 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5953 }
5954 if (i == 3)
5955 alu.last = 1;
5956 r = r600_bytecode_add_alu(ctx->bc, &alu);
5957 if (r)
5958 return r;
5959 }
5960 return 0;
5961 }
5962
5963 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
5964 {
5965 struct r600_bytecode_alu alu;
5966 int r;
5967
5968 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5969 alu.op = opcode;
5970 alu.execute_mask = 1;
5971 alu.update_pred = 1;
5972
5973 alu.dst.sel = ctx->temp_reg;
5974 alu.dst.write = 1;
5975 alu.dst.chan = 0;
5976
5977 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
5978 alu.src[1].sel = V_SQ_ALU_SRC_0;
5979 alu.src[1].chan = 0;
5980
5981 alu.last = 1;
5982
5983 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
5984 if (r)
5985 return r;
5986 return 0;
5987 }
5988
/* Emit 'pops' stack pops.  If the last CF instruction is a plain ALU
 * clause we can fold up to two pops into it by switching its opcode to
 * ALU_POP_AFTER / ALU_POP2_AFTER; otherwise (or if more pops are needed)
 * an explicit POP CF instruction is emitted. */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop counts the pops already folded into cf_last;
		 * start at 3 (unfoldable) unless cf_last is an ALU clause. */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			/* Can't fold: fall through to an explicit POP. */
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		/* POP continues at the following CF instruction. */
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
6021
/* Recompute the worst-case hardware call-stack depth (in stack "entries")
 * required by the flow control emitted so far, and record it in
 * stack->max_entries.  Called from callstack_push() with the reason of
 * the newest push so chip-specific reserve rules can be applied. */
static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
		unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements, entries;

	unsigned entry_size = stack->entry_size;

	/* Base usage: one entry per loop/WQM-push frame plus the raw
	 * sub-entry elements consumed by plain pushes. */
	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	/* Round elements up to whole hardware entries. */
	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
}
6085
6086 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
6087 {
6088 switch(reason) {
6089 case FC_PUSH_VPM:
6090 --ctx->bc->stack.push;
6091 assert(ctx->bc->stack.push >= 0);
6092 break;
6093 case FC_PUSH_WQM:
6094 --ctx->bc->stack.push_wqm;
6095 assert(ctx->bc->stack.push_wqm >= 0);
6096 break;
6097 case FC_LOOP:
6098 --ctx->bc->stack.loop;
6099 assert(ctx->bc->stack.loop >= 0);
6100 break;
6101 default:
6102 assert(0);
6103 break;
6104 }
6105 }
6106
6107 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
6108 {
6109 switch (reason) {
6110 case FC_PUSH_VPM:
6111 ++ctx->bc->stack.push;
6112 break;
6113 case FC_PUSH_WQM:
6114 ++ctx->bc->stack.push_wqm;
6115 case FC_LOOP:
6116 ++ctx->bc->stack.loop;
6117 break;
6118 default:
6119 assert(0);
6120 }
6121
6122 callstack_update_max_depth(ctx, reason);
6123 }
6124
6125 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
6126 {
6127 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
6128
6129 sp->mid = realloc((void *)sp->mid,
6130 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
6131 sp->mid[sp->num_mid] = ctx->bc->cf_last;
6132 sp->num_mid++;
6133 }
6134
6135 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
6136 {
6137 ctx->bc->fc_sp++;
6138 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
6139 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
6140 }
6141
6142 static void fc_poplevel(struct r600_shader_ctx *ctx)
6143 {
6144 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
6145 free(sp->mid);
6146 sp->mid = NULL;
6147 sp->num_mid = 0;
6148 sp->start = NULL;
6149 sp->type = 0;
6150 ctx->bc->fc_sp--;
6151 }
6152
#if 0
/* Dead prototype code for subroutine (CAL/RET) support — never compiled.
 * NOTE(review): it would not even compile as written: emit_return() and
 * emit_jump_to_offset() both have a stray ')' after the
 * r600_bytecode_add_cfinst() call, and emit_jump_to_offset() never uses
 * its 'offset' parameter (see the XXX below).  Kept only as a sketch. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
6200
6201 static int emit_if(struct r600_shader_ctx *ctx, int opcode)
6202 {
6203 int alu_type = CF_OP_ALU_PUSH_BEFORE;
6204
6205 /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
6206 * LOOP_STARTxxx for nested loops may put the branch stack into a state
6207 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
6208 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
6209 if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
6210 r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
6211 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
6212 alu_type = CF_OP_ALU;
6213 }
6214
6215 emit_logic_pred(ctx, opcode, alu_type);
6216
6217 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
6218
6219 fc_pushlevel(ctx, FC_IF);
6220
6221 callstack_push(ctx, FC_PUSH_VPM);
6222 return 0;
6223 }
6224
/* IF with a float condition: take the branch when src.x != 0.0f. */
static int tgsi_if(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE);
}
6229
/* UIF: IF with an integer condition — take the branch when src.x != 0. */
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
}
6234
6235 static int tgsi_else(struct r600_shader_ctx *ctx)
6236 {
6237 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
6238 ctx->bc->cf_last->pop_count = 1;
6239
6240 fc_set_mid(ctx, ctx->bc->fc_sp);
6241 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
6242 return 0;
6243 }
6244
/* ENDIF: emit the closing pop and patch the jump targets recorded by
 * tgsi_if()/tgsi_else(), then unwind the flow-control and call-stack
 * frames. */
static int tgsi_endif(struct r600_shader_ctx *ctx)
{
	pops(ctx, 1);
	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
		R600_ERR("if/endif unbalanced in shader\n");
		return -1;
	}

	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
		/* No ELSE: the IF's JUMP skips to just past the pop. */
		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
	} else {
		/* With an ELSE: patch the ELSE (mid[0]) to jump past the pop. */
		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
	}
	fc_poplevel(ctx);

	callstack_pop(ctx, FC_PUSH_VPM);
	return 0;
}
6264
6265 static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
6266 {
6267 /* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
6268 * limited to 4096 iterations, like the other LOOP_* instructions. */
6269 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
6270
6271 fc_pushlevel(ctx, FC_LOOP);
6272
6273 /* check stack depth */
6274 callstack_push(ctx, FC_LOOP);
6275 return 0;
6276 }
6277
/* ENDLOOP: emit LOOP_END and patch all addresses recorded for this loop
 * frame — the start/end pair and every BREAK/CONTINUE "mid" marker. */
static int tgsi_endloop(struct r600_shader_ctx *ctx)
{
	int i;

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);

	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
		R600_ERR("loop/endloop in shader code are not paired.\n");
		return -EINVAL;
	}

	/* fixup loop pointers - from r600isa
	   LOOP END points to CF after LOOP START,
	   LOOP START point to CF after LOOP END
	   BRK/CONT point to LOOP END CF
	*/
	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;

	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;

	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
	}
	/* XXX add LOOPRET support */
	fc_poplevel(ctx);
	callstack_pop(ctx, FC_LOOP);
	return 0;
}
6306
/* BREAKC: conditional loop break (break when the condition holds). */
static int tgsi_loop_breakc(struct r600_shader_ctx *ctx)
{
	int r;
	unsigned int fscp;

	/* Find the innermost FC_LOOP frame on the flow-control stack. */
	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
	{
		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
			break;
	}
	if (fscp == 0) {
		R600_ERR("BREAKC not inside loop/endloop pair\n");
		return -EINVAL;
	}

	if (ctx->bc->chip_class == EVERGREEN &&
	    ctx->bc->family != CHIP_CYPRESS &&
	    ctx->bc->family != CHIP_JUNIPER) {
		/* HW bug: ALU_BREAK does not save the active mask correctly.
		 * Emulate with UIF { LOOP_BREAK } ENDIF instead. */
		r = tgsi_uif(ctx);
		if (r)
			return r;

		r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_BREAK);
		if (r)
			return r;
		/* Record the BREAK so tgsi_endloop() can patch its target. */
		fc_set_mid(ctx, fscp);

		return tgsi_endif(ctx);
	} else {
		r = emit_logic_pred(ctx, ALU_OP2_PRED_SETE_INT, CF_OP_ALU_BREAK);
		if (r)
			return r;
		fc_set_mid(ctx, fscp);
	}

	return 0;
}
6345
6346 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
6347 {
6348 unsigned int fscp;
6349
6350 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
6351 {
6352 if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
6353 break;
6354 }
6355
6356 if (fscp == 0) {
6357 R600_ERR("Break not inside loop/endloop pair\n");
6358 return -EINVAL;
6359 }
6360
6361 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
6362
6363 fc_set_mid(ctx, fscp);
6364
6365 return 0;
6366 }
6367
6368 static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
6369 {
6370 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
6371 emit_gs_ring_writes(ctx, TRUE);
6372
6373 return r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
6374 }
6375
/* UMAD: dst = src0 * src1 + src2 (unsigned 32-bit, low half of the
 * product).  The MULLO_UINT results go into temp_reg first; a second
 * ADD_INT pass writes the real destination so dst may alias src2. */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		if (ctx->bc->chip_class == CAYMAN) {
			/* MULLO_UINT is t-slot-only pre-Cayman; on Cayman it is
			 * issued in all four vector slots and only slot i's
			 * result is kept (dst.write = (j == i)). */
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP2_MULLO_UINT;
				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
				}
				/* NOTE(review): tgsi_dst() fills alu.dst from the
				 * real destination and then sel is overridden to
				 * temp_reg, so only chan (and modifiers) survive
				 * — looks intentional but worth confirming. */
				tgsi_dst(ctx, &inst->Dst[0], j, &alu.dst);
				alu.dst.sel = ctx->temp_reg;
				alu.dst.write = (j == i);
				if (j == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.dst.chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;

			alu.op = ALU_OP2_MULLO_UINT;
			for (j = 0; j < 2; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}

			/* trans-unit op: must close the instruction group */
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}


	/* temp + src2 -> real destination */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6447
6448 static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
6449 {TGSI_OPCODE_ARL, 0, ALU_OP0_NOP, tgsi_r600_arl},
6450 {TGSI_OPCODE_MOV, 0, ALU_OP1_MOV, tgsi_op2},
6451 {TGSI_OPCODE_LIT, 0, ALU_OP0_NOP, tgsi_lit},
6452
6453 /* XXX:
6454 * For state trackers other than OpenGL, we'll want to use
6455 * _RECIP_IEEE instead.
6456 */
6457 {TGSI_OPCODE_RCP, 0, ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
6458
6459 {TGSI_OPCODE_RSQ, 0, ALU_OP0_NOP, tgsi_rsq},
6460 {TGSI_OPCODE_EXP, 0, ALU_OP0_NOP, tgsi_exp},
6461 {TGSI_OPCODE_LOG, 0, ALU_OP0_NOP, tgsi_log},
6462 {TGSI_OPCODE_MUL, 0, ALU_OP2_MUL, tgsi_op2},
6463 {TGSI_OPCODE_ADD, 0, ALU_OP2_ADD, tgsi_op2},
6464 {TGSI_OPCODE_DP3, 0, ALU_OP2_DOT4, tgsi_dp},
6465 {TGSI_OPCODE_DP4, 0, ALU_OP2_DOT4, tgsi_dp},
6466 {TGSI_OPCODE_DST, 0, ALU_OP0_NOP, tgsi_opdst},
6467 {TGSI_OPCODE_MIN, 0, ALU_OP2_MIN, tgsi_op2},
6468 {TGSI_OPCODE_MAX, 0, ALU_OP2_MAX, tgsi_op2},
6469 {TGSI_OPCODE_SLT, 0, ALU_OP2_SETGT, tgsi_op2_swap},
6470 {TGSI_OPCODE_SGE, 0, ALU_OP2_SETGE, tgsi_op2},
6471 {TGSI_OPCODE_MAD, 1, ALU_OP3_MULADD, tgsi_op3},
6472 {TGSI_OPCODE_SUB, 0, ALU_OP2_ADD, tgsi_op2},
6473 {TGSI_OPCODE_LRP, 0, ALU_OP0_NOP, tgsi_lrp},
6474 {TGSI_OPCODE_CND, 0, ALU_OP0_NOP, tgsi_unsupported},
6475 /* gap */
6476 {20, 0, ALU_OP0_NOP, tgsi_unsupported},
6477 {TGSI_OPCODE_DP2A, 0, ALU_OP0_NOP, tgsi_unsupported},
6478 /* gap */
6479 {22, 0, ALU_OP0_NOP, tgsi_unsupported},
6480 {23, 0, ALU_OP0_NOP, tgsi_unsupported},
6481 {TGSI_OPCODE_FRC, 0, ALU_OP1_FRACT, tgsi_op2},
6482 {TGSI_OPCODE_CLAMP, 0, ALU_OP0_NOP, tgsi_unsupported},
6483 {TGSI_OPCODE_FLR, 0, ALU_OP1_FLOOR, tgsi_op2},
6484 {TGSI_OPCODE_ROUND, 0, ALU_OP1_RNDNE, tgsi_op2},
6485 {TGSI_OPCODE_EX2, 0, ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
6486 {TGSI_OPCODE_LG2, 0, ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
6487 {TGSI_OPCODE_POW, 0, ALU_OP0_NOP, tgsi_pow},
6488 {TGSI_OPCODE_XPD, 0, ALU_OP0_NOP, tgsi_xpd},
6489 /* gap */
6490 {32, 0, ALU_OP0_NOP, tgsi_unsupported},
6491 {TGSI_OPCODE_ABS, 0, ALU_OP1_MOV, tgsi_op2},
6492 {TGSI_OPCODE_RCC, 0, ALU_OP0_NOP, tgsi_unsupported},
6493 {TGSI_OPCODE_DPH, 0, ALU_OP2_DOT4, tgsi_dp},
6494 {TGSI_OPCODE_COS, 0, ALU_OP1_COS, tgsi_trig},
6495 {TGSI_OPCODE_DDX, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
6496 {TGSI_OPCODE_DDY, 0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
6497 {TGSI_OPCODE_KILL, 0, ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
6498 {TGSI_OPCODE_PK2H, 0, ALU_OP0_NOP, tgsi_unsupported},
6499 {TGSI_OPCODE_PK2US, 0, ALU_OP0_NOP, tgsi_unsupported},
6500 {TGSI_OPCODE_PK4B, 0, ALU_OP0_NOP, tgsi_unsupported},
6501 {TGSI_OPCODE_PK4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
6502 {TGSI_OPCODE_RFL, 0, ALU_OP0_NOP, tgsi_unsupported},
6503 {TGSI_OPCODE_SEQ, 0, ALU_OP2_SETE, tgsi_op2},
6504 {TGSI_OPCODE_SFL, 0, ALU_OP0_NOP, tgsi_unsupported},
6505 {TGSI_OPCODE_SGT, 0, ALU_OP2_SETGT, tgsi_op2},
6506 {TGSI_OPCODE_SIN, 0, ALU_OP1_SIN, tgsi_trig},
6507 {TGSI_OPCODE_SLE, 0, ALU_OP2_SETGE, tgsi_op2_swap},
6508 {TGSI_OPCODE_SNE, 0, ALU_OP2_SETNE, tgsi_op2},
6509 {TGSI_OPCODE_STR, 0, ALU_OP0_NOP, tgsi_unsupported},
6510 {TGSI_OPCODE_TEX, 0, FETCH_OP_SAMPLE, tgsi_tex},
6511 {TGSI_OPCODE_TXD, 0, FETCH_OP_SAMPLE_G, tgsi_tex},
6512 {TGSI_OPCODE_TXP, 0, FETCH_OP_SAMPLE, tgsi_tex},
6513 {TGSI_OPCODE_UP2H, 0, ALU_OP0_NOP, tgsi_unsupported},
6514 {TGSI_OPCODE_UP2US, 0, ALU_OP0_NOP, tgsi_unsupported},
6515 {TGSI_OPCODE_UP4B, 0, ALU_OP0_NOP, tgsi_unsupported},
6516 {TGSI_OPCODE_UP4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
6517 {TGSI_OPCODE_X2D, 0, ALU_OP0_NOP, tgsi_unsupported},
6518 {TGSI_OPCODE_ARA, 0, ALU_OP0_NOP, tgsi_unsupported},
6519 {TGSI_OPCODE_ARR, 0, ALU_OP0_NOP, tgsi_r600_arl},
6520 {TGSI_OPCODE_BRA, 0, ALU_OP0_NOP, tgsi_unsupported},
6521 {TGSI_OPCODE_CAL, 0, ALU_OP0_NOP, tgsi_unsupported},
6522 {TGSI_OPCODE_RET, 0, ALU_OP0_NOP, tgsi_unsupported},
6523 {TGSI_OPCODE_SSG, 0, ALU_OP0_NOP, tgsi_ssg},
6524 {TGSI_OPCODE_CMP, 0, ALU_OP0_NOP, tgsi_cmp},
6525 {TGSI_OPCODE_SCS, 0, ALU_OP0_NOP, tgsi_scs},
6526 {TGSI_OPCODE_TXB, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
6527 {TGSI_OPCODE_NRM, 0, ALU_OP0_NOP, tgsi_unsupported},
6528 {TGSI_OPCODE_DIV, 0, ALU_OP0_NOP, tgsi_unsupported},
6529 {TGSI_OPCODE_DP2, 0, ALU_OP2_DOT4, tgsi_dp},
6530 {TGSI_OPCODE_TXL, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
6531 {TGSI_OPCODE_BRK, 0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
6532 {TGSI_OPCODE_IF, 0, ALU_OP0_NOP, tgsi_if},
6533 {TGSI_OPCODE_UIF, 0, ALU_OP0_NOP, tgsi_uif},
6534 {76, 0, ALU_OP0_NOP, tgsi_unsupported},
6535 {TGSI_OPCODE_ELSE, 0, ALU_OP0_NOP, tgsi_else},
6536 {TGSI_OPCODE_ENDIF, 0, ALU_OP0_NOP, tgsi_endif},
6537 /* gap */
6538 {79, 0, ALU_OP0_NOP, tgsi_unsupported},
6539 {80, 0, ALU_OP0_NOP, tgsi_unsupported},
6540 {TGSI_OPCODE_PUSHA, 0, ALU_OP0_NOP, tgsi_unsupported},
6541 {TGSI_OPCODE_POPA, 0, ALU_OP0_NOP, tgsi_unsupported},
6542 {TGSI_OPCODE_CEIL, 0, ALU_OP1_CEIL, tgsi_op2},
6543 {TGSI_OPCODE_I2F, 0, ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
6544 {TGSI_OPCODE_NOT, 0, ALU_OP1_NOT_INT, tgsi_op2},
6545 {TGSI_OPCODE_TRUNC, 0, ALU_OP1_TRUNC, tgsi_op2},
6546 {TGSI_OPCODE_SHL, 0, ALU_OP2_LSHL_INT, tgsi_op2_trans},
6547 /* gap */
6548 {88, 0, ALU_OP0_NOP, tgsi_unsupported},
6549 {TGSI_OPCODE_AND, 0, ALU_OP2_AND_INT, tgsi_op2},
6550 {TGSI_OPCODE_OR, 0, ALU_OP2_OR_INT, tgsi_op2},
6551 {TGSI_OPCODE_MOD, 0, ALU_OP0_NOP, tgsi_imod},
6552 {TGSI_OPCODE_XOR, 0, ALU_OP2_XOR_INT, tgsi_op2},
6553 {TGSI_OPCODE_SAD, 0, ALU_OP0_NOP, tgsi_unsupported},
6554 {TGSI_OPCODE_TXF, 0, FETCH_OP_LD, tgsi_tex},
6555 {TGSI_OPCODE_TXQ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
6556 {TGSI_OPCODE_CONT, 0, CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
6557 {TGSI_OPCODE_EMIT, 0, CF_OP_EMIT_VERTEX, tgsi_gs_emit},
6558 {TGSI_OPCODE_ENDPRIM, 0, CF_OP_CUT_VERTEX, tgsi_gs_emit},
6559 {TGSI_OPCODE_BGNLOOP, 0, ALU_OP0_NOP, tgsi_bgnloop},
6560 {TGSI_OPCODE_BGNSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
6561 {TGSI_OPCODE_ENDLOOP, 0, ALU_OP0_NOP, tgsi_endloop},
6562 {TGSI_OPCODE_ENDSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
6563 {TGSI_OPCODE_TXQ_LZ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
6564 /* gap */
6565 {104, 0, ALU_OP0_NOP, tgsi_unsupported},
6566 {105, 0, ALU_OP0_NOP, tgsi_unsupported},
6567 {106, 0, ALU_OP0_NOP, tgsi_unsupported},
6568 {TGSI_OPCODE_NOP, 0, ALU_OP0_NOP, tgsi_unsupported},
6569 {TGSI_OPCODE_FSEQ, 0, ALU_OP2_SETE_DX10, tgsi_op2},
6570 {TGSI_OPCODE_FSGE, 0, ALU_OP2_SETGE_DX10, tgsi_op2},
6571 {TGSI_OPCODE_FSLT, 0, ALU_OP2_SETGT_DX10, tgsi_op2_swap},
6572 {TGSI_OPCODE_FSNE, 0, ALU_OP2_SETNE_DX10, tgsi_op2_swap},
6573 {TGSI_OPCODE_NRM4, 0, ALU_OP0_NOP, tgsi_unsupported},
6574 {TGSI_OPCODE_CALLNZ, 0, ALU_OP0_NOP, tgsi_unsupported},
6575 /* gap */
6576 {114, 0, ALU_OP0_NOP, tgsi_unsupported},
6577 {TGSI_OPCODE_BREAKC, 0, ALU_OP0_NOP, tgsi_loop_breakc},
6578 {TGSI_OPCODE_KILL_IF, 0, ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
6579 {TGSI_OPCODE_END, 0, ALU_OP0_NOP, tgsi_end}, /* aka HALT */
6580 /* gap */
6581 {118, 0, ALU_OP0_NOP, tgsi_unsupported},
6582 {TGSI_OPCODE_F2I, 0, ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
6583 {TGSI_OPCODE_IDIV, 0, ALU_OP0_NOP, tgsi_idiv},
6584 {TGSI_OPCODE_IMAX, 0, ALU_OP2_MAX_INT, tgsi_op2},
6585 {TGSI_OPCODE_IMIN, 0, ALU_OP2_MIN_INT, tgsi_op2},
6586 {TGSI_OPCODE_INEG, 0, ALU_OP2_SUB_INT, tgsi_ineg},
6587 {TGSI_OPCODE_ISGE, 0, ALU_OP2_SETGE_INT, tgsi_op2},
6588 {TGSI_OPCODE_ISHR, 0, ALU_OP2_ASHR_INT, tgsi_op2_trans},
6589 {TGSI_OPCODE_ISLT, 0, ALU_OP2_SETGT_INT, tgsi_op2_swap},
6590 {TGSI_OPCODE_F2U, 0, ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
6591 {TGSI_OPCODE_U2F, 0, ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
6592 {TGSI_OPCODE_UADD, 0, ALU_OP2_ADD_INT, tgsi_op2},
6593 {TGSI_OPCODE_UDIV, 0, ALU_OP0_NOP, tgsi_udiv},
6594 {TGSI_OPCODE_UMAD, 0, ALU_OP0_NOP, tgsi_umad},
6595 {TGSI_OPCODE_UMAX, 0, ALU_OP2_MAX_UINT, tgsi_op2},
6596 {TGSI_OPCODE_UMIN, 0, ALU_OP2_MIN_UINT, tgsi_op2},
6597 {TGSI_OPCODE_UMOD, 0, ALU_OP0_NOP, tgsi_umod},
6598 {TGSI_OPCODE_UMUL, 0, ALU_OP2_MULLO_UINT, tgsi_op2_trans},
6599 {TGSI_OPCODE_USEQ, 0, ALU_OP2_SETE_INT, tgsi_op2},
6600 {TGSI_OPCODE_USGE, 0, ALU_OP2_SETGE_UINT, tgsi_op2},
6601 {TGSI_OPCODE_USHR, 0, ALU_OP2_LSHR_INT, tgsi_op2_trans},
6602 {TGSI_OPCODE_USLT, 0, ALU_OP2_SETGT_UINT, tgsi_op2_swap},
6603 {TGSI_OPCODE_USNE, 0, ALU_OP2_SETNE_INT, tgsi_op2_swap},
6604 {TGSI_OPCODE_SWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
6605 {TGSI_OPCODE_CASE, 0, ALU_OP0_NOP, tgsi_unsupported},
6606 {TGSI_OPCODE_DEFAULT, 0, ALU_OP0_NOP, tgsi_unsupported},
6607 {TGSI_OPCODE_ENDSWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
6608 {TGSI_OPCODE_SAMPLE, 0, 0, tgsi_unsupported},
6609 {TGSI_OPCODE_SAMPLE_I, 0, 0, tgsi_unsupported},
6610 {TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
6611 {TGSI_OPCODE_SAMPLE_B, 0, 0, tgsi_unsupported},
6612 {TGSI_OPCODE_SAMPLE_C, 0, 0, tgsi_unsupported},
6613 {TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
6614 {TGSI_OPCODE_SAMPLE_D, 0, 0, tgsi_unsupported},
6615 {TGSI_OPCODE_SAMPLE_L, 0, 0, tgsi_unsupported},
6616 {TGSI_OPCODE_GATHER4, 0, 0, tgsi_unsupported},
6617 {TGSI_OPCODE_SVIEWINFO, 0, 0, tgsi_unsupported},
6618 {TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
6619 {TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
6620 {TGSI_OPCODE_UARL, 0, ALU_OP1_MOVA_INT, tgsi_r600_arl},
6621 {TGSI_OPCODE_UCMP, 0, ALU_OP0_NOP, tgsi_ucmp},
6622 {TGSI_OPCODE_IABS, 0, 0, tgsi_iabs},
6623 {TGSI_OPCODE_ISSG, 0, 0, tgsi_issg},
6624 {TGSI_OPCODE_LOAD, 0, ALU_OP0_NOP, tgsi_unsupported},
6625 {TGSI_OPCODE_STORE, 0, ALU_OP0_NOP, tgsi_unsupported},
6626 {TGSI_OPCODE_MFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
6627 {TGSI_OPCODE_LFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
6628 {TGSI_OPCODE_SFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
6629 {TGSI_OPCODE_BARRIER, 0, ALU_OP0_NOP, tgsi_unsupported},
6630 {TGSI_OPCODE_ATOMUADD, 0, ALU_OP0_NOP, tgsi_unsupported},
6631 {TGSI_OPCODE_ATOMXCHG, 0, ALU_OP0_NOP, tgsi_unsupported},
6632 {TGSI_OPCODE_ATOMCAS, 0, ALU_OP0_NOP, tgsi_unsupported},
6633 {TGSI_OPCODE_ATOMAND, 0, ALU_OP0_NOP, tgsi_unsupported},
6634 {TGSI_OPCODE_ATOMOR, 0, ALU_OP0_NOP, tgsi_unsupported},
6635 {TGSI_OPCODE_ATOMXOR, 0, ALU_OP0_NOP, tgsi_unsupported},
6636 {TGSI_OPCODE_ATOMUMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
6637 {TGSI_OPCODE_ATOMUMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
6638 {TGSI_OPCODE_ATOMIMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
6639 {TGSI_OPCODE_ATOMIMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
6640 {TGSI_OPCODE_TEX2, 0, FETCH_OP_SAMPLE, tgsi_tex},
6641 {TGSI_OPCODE_TXB2, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
6642 {TGSI_OPCODE_TXL2, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
6643 {TGSI_OPCODE_LAST, 0, ALU_OP0_NOP, tgsi_unsupported},
6644 };
6645
/*
 * TGSI -> Evergreen (EG) opcode translation table.
 *
 * Each entry pairs a TGSI opcode with the hardware ALU/CF/fetch op to use
 * and the callback that emits the corresponding bytecode.  Entries appear
 * in TGSI opcode order, with bare numeric entries ({20}, {76}, {104}, ...)
 * standing in for holes in the opcode space -- presumably the table is
 * indexed directly by opcode value, so do NOT reorder, insert, or remove
 * entries (TODO confirm against the lookup site).
 *
 * The second field is nonzero only for the 3-source MULADD (MAD) entry;
 * its exact meaning (likely an "is op3" flag) is not visible here.
 *
 * Differences from the r600 table above, visible by comparison:
 *  - ARL/ARR/UARL use tgsi_eg_arl instead of tgsi_r600_arl;
 *  - F2I/F2U use tgsi_f2i instead of tgsi_op2_trans;
 *  - SHL/ISHR/USHR use plain tgsi_op2 rather than tgsi_op2_trans;
 *  - BREAKC is unsupported here (r600 routes it to tgsi_loop_breakc).
 */
6646 static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
6647 {TGSI_OPCODE_ARL, 0, ALU_OP0_NOP, tgsi_eg_arl},
6648 {TGSI_OPCODE_MOV, 0, ALU_OP1_MOV, tgsi_op2},
6649 {TGSI_OPCODE_LIT, 0, ALU_OP0_NOP, tgsi_lit},
6650 {TGSI_OPCODE_RCP, 0, ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
6651 {TGSI_OPCODE_RSQ, 0, ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
6652 {TGSI_OPCODE_EXP, 0, ALU_OP0_NOP, tgsi_exp},
6653 {TGSI_OPCODE_LOG, 0, ALU_OP0_NOP, tgsi_log},
6654 {TGSI_OPCODE_MUL, 0, ALU_OP2_MUL, tgsi_op2},
6655 {TGSI_OPCODE_ADD, 0, ALU_OP2_ADD, tgsi_op2},
6656 {TGSI_OPCODE_DP3, 0, ALU_OP2_DOT4, tgsi_dp},
6657 {TGSI_OPCODE_DP4, 0, ALU_OP2_DOT4, tgsi_dp},
6658 {TGSI_OPCODE_DST, 0, ALU_OP0_NOP, tgsi_opdst},
6659 {TGSI_OPCODE_MIN, 0, ALU_OP2_MIN, tgsi_op2},
6660 {TGSI_OPCODE_MAX, 0, ALU_OP2_MAX, tgsi_op2},
6661 {TGSI_OPCODE_SLT, 0, ALU_OP2_SETGT, tgsi_op2_swap},
6662 {TGSI_OPCODE_SGE, 0, ALU_OP2_SETGE, tgsi_op2},
6663 {TGSI_OPCODE_MAD, 1, ALU_OP3_MULADD, tgsi_op3},
6664 {TGSI_OPCODE_SUB, 0, ALU_OP2_ADD, tgsi_op2},
6665 {TGSI_OPCODE_LRP, 0, ALU_OP0_NOP, tgsi_lrp},
6666 {TGSI_OPCODE_CND, 0, ALU_OP0_NOP, tgsi_unsupported},
6667 /* gap */
6668 {20, 0, ALU_OP0_NOP, tgsi_unsupported},
6669 {TGSI_OPCODE_DP2A, 0, ALU_OP0_NOP, tgsi_unsupported},
6670 /* gap */
6671 {22, 0, ALU_OP0_NOP, tgsi_unsupported},
6672 {23, 0, ALU_OP0_NOP, tgsi_unsupported},
6673 {TGSI_OPCODE_FRC, 0, ALU_OP1_FRACT, tgsi_op2},
6674 {TGSI_OPCODE_CLAMP, 0, ALU_OP0_NOP, tgsi_unsupported},
6675 {TGSI_OPCODE_FLR, 0, ALU_OP1_FLOOR, tgsi_op2},
6676 {TGSI_OPCODE_ROUND, 0, ALU_OP1_RNDNE, tgsi_op2},
6677 {TGSI_OPCODE_EX2, 0, ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
6678 {TGSI_OPCODE_LG2, 0, ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
6679 {TGSI_OPCODE_POW, 0, ALU_OP0_NOP, tgsi_pow},
6680 {TGSI_OPCODE_XPD, 0, ALU_OP0_NOP, tgsi_xpd},
6681 /* gap */
6682 {32, 0, ALU_OP0_NOP, tgsi_unsupported},
6683 {TGSI_OPCODE_ABS, 0, ALU_OP1_MOV, tgsi_op2},
6684 {TGSI_OPCODE_RCC, 0, ALU_OP0_NOP, tgsi_unsupported},
6685 {TGSI_OPCODE_DPH, 0, ALU_OP2_DOT4, tgsi_dp},
6686 {TGSI_OPCODE_COS, 0, ALU_OP1_COS, tgsi_trig},
6687 {TGSI_OPCODE_DDX, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
6688 {TGSI_OPCODE_DDY, 0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
6689 {TGSI_OPCODE_KILL, 0, ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
6690 {TGSI_OPCODE_PK2H, 0, ALU_OP0_NOP, tgsi_unsupported},
6691 {TGSI_OPCODE_PK2US, 0, ALU_OP0_NOP, tgsi_unsupported},
6692 {TGSI_OPCODE_PK4B, 0, ALU_OP0_NOP, tgsi_unsupported},
6693 {TGSI_OPCODE_PK4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
6694 {TGSI_OPCODE_RFL, 0, ALU_OP0_NOP, tgsi_unsupported},
6695 {TGSI_OPCODE_SEQ, 0, ALU_OP2_SETE, tgsi_op2},
6696 {TGSI_OPCODE_SFL, 0, ALU_OP0_NOP, tgsi_unsupported},
6697 {TGSI_OPCODE_SGT, 0, ALU_OP2_SETGT, tgsi_op2},
6698 {TGSI_OPCODE_SIN, 0, ALU_OP1_SIN, tgsi_trig},
6699 {TGSI_OPCODE_SLE, 0, ALU_OP2_SETGE, tgsi_op2_swap},
6700 {TGSI_OPCODE_SNE, 0, ALU_OP2_SETNE, tgsi_op2},
6701 {TGSI_OPCODE_STR, 0, ALU_OP0_NOP, tgsi_unsupported},
6702 {TGSI_OPCODE_TEX, 0, FETCH_OP_SAMPLE, tgsi_tex},
6703 {TGSI_OPCODE_TXD, 0, FETCH_OP_SAMPLE_G, tgsi_tex},
6704 {TGSI_OPCODE_TXP, 0, FETCH_OP_SAMPLE, tgsi_tex},
6705 {TGSI_OPCODE_UP2H, 0, ALU_OP0_NOP, tgsi_unsupported},
6706 {TGSI_OPCODE_UP2US, 0, ALU_OP0_NOP, tgsi_unsupported},
6707 {TGSI_OPCODE_UP4B, 0, ALU_OP0_NOP, tgsi_unsupported},
6708 {TGSI_OPCODE_UP4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
6709 {TGSI_OPCODE_X2D, 0, ALU_OP0_NOP, tgsi_unsupported},
6710 {TGSI_OPCODE_ARA, 0, ALU_OP0_NOP, tgsi_unsupported},
6711 {TGSI_OPCODE_ARR, 0, ALU_OP0_NOP, tgsi_eg_arl},
6712 {TGSI_OPCODE_BRA, 0, ALU_OP0_NOP, tgsi_unsupported},
6713 {TGSI_OPCODE_CAL, 0, ALU_OP0_NOP, tgsi_unsupported},
6714 {TGSI_OPCODE_RET, 0, ALU_OP0_NOP, tgsi_unsupported},
6715 {TGSI_OPCODE_SSG, 0, ALU_OP0_NOP, tgsi_ssg},
6716 {TGSI_OPCODE_CMP, 0, ALU_OP0_NOP, tgsi_cmp},
6717 {TGSI_OPCODE_SCS, 0, ALU_OP0_NOP, tgsi_scs},
6718 {TGSI_OPCODE_TXB, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
6719 {TGSI_OPCODE_NRM, 0, ALU_OP0_NOP, tgsi_unsupported},
6720 {TGSI_OPCODE_DIV, 0, ALU_OP0_NOP, tgsi_unsupported},
6721 {TGSI_OPCODE_DP2, 0, ALU_OP2_DOT4, tgsi_dp},
6722 {TGSI_OPCODE_TXL, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
6723 {TGSI_OPCODE_BRK, 0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
6724 {TGSI_OPCODE_IF, 0, ALU_OP0_NOP, tgsi_if},
6725 {TGSI_OPCODE_UIF, 0, ALU_OP0_NOP, tgsi_uif},
6726 {76, 0, ALU_OP0_NOP, tgsi_unsupported},
6727 {TGSI_OPCODE_ELSE, 0, ALU_OP0_NOP, tgsi_else},
6728 {TGSI_OPCODE_ENDIF, 0, ALU_OP0_NOP, tgsi_endif},
6729 /* gap */
6730 {79, 0, ALU_OP0_NOP, tgsi_unsupported},
6731 {80, 0, ALU_OP0_NOP, tgsi_unsupported},
6732 {TGSI_OPCODE_PUSHA, 0, ALU_OP0_NOP, tgsi_unsupported},
6733 {TGSI_OPCODE_POPA, 0, ALU_OP0_NOP, tgsi_unsupported},
6734 {TGSI_OPCODE_CEIL, 0, ALU_OP1_CEIL, tgsi_op2},
6735 {TGSI_OPCODE_I2F, 0, ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
6736 {TGSI_OPCODE_NOT, 0, ALU_OP1_NOT_INT, tgsi_op2},
6737 {TGSI_OPCODE_TRUNC, 0, ALU_OP1_TRUNC, tgsi_op2},
6738 {TGSI_OPCODE_SHL, 0, ALU_OP2_LSHL_INT, tgsi_op2},
6739 /* gap */
6740 {88, 0, ALU_OP0_NOP, tgsi_unsupported},
6741 {TGSI_OPCODE_AND, 0, ALU_OP2_AND_INT, tgsi_op2},
6742 {TGSI_OPCODE_OR, 0, ALU_OP2_OR_INT, tgsi_op2},
6743 {TGSI_OPCODE_MOD, 0, ALU_OP0_NOP, tgsi_imod},
6744 {TGSI_OPCODE_XOR, 0, ALU_OP2_XOR_INT, tgsi_op2},
6745 {TGSI_OPCODE_SAD, 0, ALU_OP0_NOP, tgsi_unsupported},
6746 {TGSI_OPCODE_TXF, 0, FETCH_OP_LD, tgsi_tex},
6747 {TGSI_OPCODE_TXQ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
6748 {TGSI_OPCODE_CONT, 0, CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
6749 {TGSI_OPCODE_EMIT, 0, CF_OP_EMIT_VERTEX, tgsi_gs_emit},
6750 {TGSI_OPCODE_ENDPRIM, 0, CF_OP_CUT_VERTEX, tgsi_gs_emit},
6751 {TGSI_OPCODE_BGNLOOP, 0, ALU_OP0_NOP, tgsi_bgnloop},
6752 {TGSI_OPCODE_BGNSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
6753 {TGSI_OPCODE_ENDLOOP, 0, ALU_OP0_NOP, tgsi_endloop},
6754 {TGSI_OPCODE_ENDSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
6755 {TGSI_OPCODE_TXQ_LZ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
6756 /* gap */
6757 {104, 0, ALU_OP0_NOP, tgsi_unsupported},
6758 {105, 0, ALU_OP0_NOP, tgsi_unsupported},
6759 {106, 0, ALU_OP0_NOP, tgsi_unsupported},
6760 {TGSI_OPCODE_NOP, 0, ALU_OP0_NOP, tgsi_unsupported},
6761 {TGSI_OPCODE_FSEQ, 0, ALU_OP2_SETE_DX10, tgsi_op2},
6762 {TGSI_OPCODE_FSGE, 0, ALU_OP2_SETGE_DX10, tgsi_op2},
6763 {TGSI_OPCODE_FSLT, 0, ALU_OP2_SETGT_DX10, tgsi_op2_swap},
6764 {TGSI_OPCODE_FSNE, 0, ALU_OP2_SETNE_DX10, tgsi_op2_swap},
6765 {TGSI_OPCODE_NRM4, 0, ALU_OP0_NOP, tgsi_unsupported},
6766 {TGSI_OPCODE_CALLNZ, 0, ALU_OP0_NOP, tgsi_unsupported},
6767 /* gap */
6768 {114, 0, ALU_OP0_NOP, tgsi_unsupported},
6769 {TGSI_OPCODE_BREAKC, 0, ALU_OP0_NOP, tgsi_unsupported}, /* handled on r600 only (tgsi_loop_breakc) */
6770 {TGSI_OPCODE_KILL_IF, 0, ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
6771 {TGSI_OPCODE_END, 0, ALU_OP0_NOP, tgsi_end}, /* aka HALT */
6772 /* gap */
6773 {118, 0, ALU_OP0_NOP, tgsi_unsupported},
6774 {TGSI_OPCODE_F2I, 0, ALU_OP1_FLT_TO_INT, tgsi_f2i},
6775 {TGSI_OPCODE_IDIV, 0, ALU_OP0_NOP, tgsi_idiv},
6776 {TGSI_OPCODE_IMAX, 0, ALU_OP2_MAX_INT, tgsi_op2},
6777 {TGSI_OPCODE_IMIN, 0, ALU_OP2_MIN_INT, tgsi_op2},
6778 {TGSI_OPCODE_INEG, 0, ALU_OP2_SUB_INT, tgsi_ineg},
6779 {TGSI_OPCODE_ISGE, 0, ALU_OP2_SETGE_INT, tgsi_op2},
6780 {TGSI_OPCODE_ISHR, 0, ALU_OP2_ASHR_INT, tgsi_op2},
6781 {TGSI_OPCODE_ISLT, 0, ALU_OP2_SETGT_INT, tgsi_op2_swap},
6782 {TGSI_OPCODE_F2U, 0, ALU_OP1_FLT_TO_UINT, tgsi_f2i},
6783 {TGSI_OPCODE_U2F, 0, ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
6784 {TGSI_OPCODE_UADD, 0, ALU_OP2_ADD_INT, tgsi_op2},
6785 {TGSI_OPCODE_UDIV, 0, ALU_OP0_NOP, tgsi_udiv},
6786 {TGSI_OPCODE_UMAD, 0, ALU_OP0_NOP, tgsi_umad},
6787 {TGSI_OPCODE_UMAX, 0, ALU_OP2_MAX_UINT, tgsi_op2},
6788 {TGSI_OPCODE_UMIN, 0, ALU_OP2_MIN_UINT, tgsi_op2},
6789 {TGSI_OPCODE_UMOD, 0, ALU_OP0_NOP, tgsi_umod},
6790 {TGSI_OPCODE_UMUL, 0, ALU_OP2_MULLO_UINT, tgsi_op2_trans},
6791 {TGSI_OPCODE_USEQ, 0, ALU_OP2_SETE_INT, tgsi_op2},
6792 {TGSI_OPCODE_USGE, 0, ALU_OP2_SETGE_UINT, tgsi_op2},
6793 {TGSI_OPCODE_USHR, 0, ALU_OP2_LSHR_INT, tgsi_op2},
6794 {TGSI_OPCODE_USLT, 0, ALU_OP2_SETGT_UINT, tgsi_op2_swap},
6795 {TGSI_OPCODE_USNE, 0, ALU_OP2_SETNE_INT, tgsi_op2},
6796 {TGSI_OPCODE_SWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
6797 {TGSI_OPCODE_CASE, 0, ALU_OP0_NOP, tgsi_unsupported},
6798 {TGSI_OPCODE_DEFAULT, 0, ALU_OP0_NOP, tgsi_unsupported},
6799 {TGSI_OPCODE_ENDSWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
6800 {TGSI_OPCODE_SAMPLE, 0, 0, tgsi_unsupported},
6801 {TGSI_OPCODE_SAMPLE_I, 0, 0, tgsi_unsupported},
6802 {TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
6803 {TGSI_OPCODE_SAMPLE_B, 0, 0, tgsi_unsupported},
6804 {TGSI_OPCODE_SAMPLE_C, 0, 0, tgsi_unsupported},
6805 {TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
6806 {TGSI_OPCODE_SAMPLE_D, 0, 0, tgsi_unsupported},
6807 {TGSI_OPCODE_SAMPLE_L, 0, 0, tgsi_unsupported},
6808 {TGSI_OPCODE_GATHER4, 0, 0, tgsi_unsupported},
6809 {TGSI_OPCODE_SVIEWINFO, 0, 0, tgsi_unsupported},
6810 {TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
6811 {TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
6812 {TGSI_OPCODE_UARL, 0, ALU_OP1_MOVA_INT, tgsi_eg_arl},
6813 {TGSI_OPCODE_UCMP, 0, ALU_OP0_NOP, tgsi_ucmp},
6814 {TGSI_OPCODE_IABS, 0, 0, tgsi_iabs},
6815 {TGSI_OPCODE_ISSG, 0, 0, tgsi_issg},
6816 {TGSI_OPCODE_LOAD, 0, ALU_OP0_NOP, tgsi_unsupported},
6817 {TGSI_OPCODE_STORE, 0, ALU_OP0_NOP, tgsi_unsupported},
6818 {TGSI_OPCODE_MFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
6819 {TGSI_OPCODE_LFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
6820 {TGSI_OPCODE_SFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
6821 {TGSI_OPCODE_BARRIER, 0, ALU_OP0_NOP, tgsi_unsupported},
6822 {TGSI_OPCODE_ATOMUADD, 0, ALU_OP0_NOP, tgsi_unsupported},
6823 {TGSI_OPCODE_ATOMXCHG, 0, ALU_OP0_NOP, tgsi_unsupported},
6824 {TGSI_OPCODE_ATOMCAS, 0, ALU_OP0_NOP, tgsi_unsupported},
6825 {TGSI_OPCODE_ATOMAND, 0, ALU_OP0_NOP, tgsi_unsupported},
6826 {TGSI_OPCODE_ATOMOR, 0, ALU_OP0_NOP, tgsi_unsupported},
6827 {TGSI_OPCODE_ATOMXOR, 0, ALU_OP0_NOP, tgsi_unsupported},
6828 {TGSI_OPCODE_ATOMUMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
6829 {TGSI_OPCODE_ATOMUMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
6830 {TGSI_OPCODE_ATOMIMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
6831 {TGSI_OPCODE_ATOMIMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
6832 {TGSI_OPCODE_TEX2, 0, FETCH_OP_SAMPLE, tgsi_tex},
6833 {TGSI_OPCODE_TXB2, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
6834 {TGSI_OPCODE_TXL2, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
6835 {TGSI_OPCODE_LAST, 0, ALU_OP0_NOP, tgsi_unsupported}, /* sentinel: table ends at TGSI_OPCODE_LAST */
6836 };
6837
/*
 * TGSI -> Cayman (CM) opcode translation table.
 *
 * Same layout and ordering contract as the EG table: one entry per TGSI
 * opcode in opcode order, numeric placeholders filling holes in the opcode
 * space -- presumably indexed directly by opcode value, so do NOT reorder
 * or remove entries (TODO confirm against the lookup site).
 *
 * Cayman-specific substitutions visible by comparison with the EG table:
 *  - RCP/RSQ/EX2/LG2 emit via cayman_emit_float_instr instead of the
 *    tgsi_trans_srcx_replicate / tgsi_rsq helpers;
 *  - COS/SIN use cayman_trig, POW uses cayman_pow;
 *  - UMUL uses ALU_OP2_MULLO_INT with cayman_mul_int_instr (EG uses
 *    ALU_OP2_MULLO_UINT with tgsi_op2_trans);
 *  - I2F/U2F/F2I/F2U use plain tgsi_op2 (no trans-slot variants here).
 */
6838 static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
6839 {TGSI_OPCODE_ARL, 0, ALU_OP0_NOP, tgsi_eg_arl},
6840 {TGSI_OPCODE_MOV, 0, ALU_OP1_MOV, tgsi_op2},
6841 {TGSI_OPCODE_LIT, 0, ALU_OP0_NOP, tgsi_lit},
6842 {TGSI_OPCODE_RCP, 0, ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
6843 {TGSI_OPCODE_RSQ, 0, ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
6844 {TGSI_OPCODE_EXP, 0, ALU_OP0_NOP, tgsi_exp},
6845 {TGSI_OPCODE_LOG, 0, ALU_OP0_NOP, tgsi_log},
6846 {TGSI_OPCODE_MUL, 0, ALU_OP2_MUL, tgsi_op2},
6847 {TGSI_OPCODE_ADD, 0, ALU_OP2_ADD, tgsi_op2},
6848 {TGSI_OPCODE_DP3, 0, ALU_OP2_DOT4, tgsi_dp},
6849 {TGSI_OPCODE_DP4, 0, ALU_OP2_DOT4, tgsi_dp},
6850 {TGSI_OPCODE_DST, 0, ALU_OP0_NOP, tgsi_opdst},
6851 {TGSI_OPCODE_MIN, 0, ALU_OP2_MIN, tgsi_op2},
6852 {TGSI_OPCODE_MAX, 0, ALU_OP2_MAX, tgsi_op2},
6853 {TGSI_OPCODE_SLT, 0, ALU_OP2_SETGT, tgsi_op2_swap},
6854 {TGSI_OPCODE_SGE, 0, ALU_OP2_SETGE, tgsi_op2},
6855 {TGSI_OPCODE_MAD, 1, ALU_OP3_MULADD, tgsi_op3},
6856 {TGSI_OPCODE_SUB, 0, ALU_OP2_ADD, tgsi_op2},
6857 {TGSI_OPCODE_LRP, 0, ALU_OP0_NOP, tgsi_lrp},
6858 {TGSI_OPCODE_CND, 0, ALU_OP0_NOP, tgsi_unsupported},
6859 /* gap */
6860 {20, 0, ALU_OP0_NOP, tgsi_unsupported},
6861 {TGSI_OPCODE_DP2A, 0, ALU_OP0_NOP, tgsi_unsupported},
6862 /* gap */
6863 {22, 0, ALU_OP0_NOP, tgsi_unsupported},
6864 {23, 0, ALU_OP0_NOP, tgsi_unsupported},
6865 {TGSI_OPCODE_FRC, 0, ALU_OP1_FRACT, tgsi_op2},
6866 {TGSI_OPCODE_CLAMP, 0, ALU_OP0_NOP, tgsi_unsupported},
6867 {TGSI_OPCODE_FLR, 0, ALU_OP1_FLOOR, tgsi_op2},
6868 {TGSI_OPCODE_ROUND, 0, ALU_OP1_RNDNE, tgsi_op2},
6869 {TGSI_OPCODE_EX2, 0, ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
6870 {TGSI_OPCODE_LG2, 0, ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
6871 {TGSI_OPCODE_POW, 0, ALU_OP0_NOP, cayman_pow},
6872 {TGSI_OPCODE_XPD, 0, ALU_OP0_NOP, tgsi_xpd},
6873 /* gap */
6874 {32, 0, ALU_OP0_NOP, tgsi_unsupported},
6875 {TGSI_OPCODE_ABS, 0, ALU_OP1_MOV, tgsi_op2},
6876 {TGSI_OPCODE_RCC, 0, ALU_OP0_NOP, tgsi_unsupported},
6877 {TGSI_OPCODE_DPH, 0, ALU_OP2_DOT4, tgsi_dp},
6878 {TGSI_OPCODE_COS, 0, ALU_OP1_COS, cayman_trig},
6879 {TGSI_OPCODE_DDX, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
6880 {TGSI_OPCODE_DDY, 0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
6881 {TGSI_OPCODE_KILL, 0, ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
6882 {TGSI_OPCODE_PK2H, 0, ALU_OP0_NOP, tgsi_unsupported},
6883 {TGSI_OPCODE_PK2US, 0, ALU_OP0_NOP, tgsi_unsupported},
6884 {TGSI_OPCODE_PK4B, 0, ALU_OP0_NOP, tgsi_unsupported},
6885 {TGSI_OPCODE_PK4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
6886 {TGSI_OPCODE_RFL, 0, ALU_OP0_NOP, tgsi_unsupported},
6887 {TGSI_OPCODE_SEQ, 0, ALU_OP2_SETE, tgsi_op2},
6888 {TGSI_OPCODE_SFL, 0, ALU_OP0_NOP, tgsi_unsupported},
6889 {TGSI_OPCODE_SGT, 0, ALU_OP2_SETGT, tgsi_op2},
6890 {TGSI_OPCODE_SIN, 0, ALU_OP1_SIN, cayman_trig},
6891 {TGSI_OPCODE_SLE, 0, ALU_OP2_SETGE, tgsi_op2_swap},
6892 {TGSI_OPCODE_SNE, 0, ALU_OP2_SETNE, tgsi_op2},
6893 {TGSI_OPCODE_STR, 0, ALU_OP0_NOP, tgsi_unsupported},
6894 {TGSI_OPCODE_TEX, 0, FETCH_OP_SAMPLE, tgsi_tex},
6895 {TGSI_OPCODE_TXD, 0, FETCH_OP_SAMPLE_G, tgsi_tex},
6896 {TGSI_OPCODE_TXP, 0, FETCH_OP_SAMPLE, tgsi_tex},
6897 {TGSI_OPCODE_UP2H, 0, ALU_OP0_NOP, tgsi_unsupported},
6898 {TGSI_OPCODE_UP2US, 0, ALU_OP0_NOP, tgsi_unsupported},
6899 {TGSI_OPCODE_UP4B, 0, ALU_OP0_NOP, tgsi_unsupported},
6900 {TGSI_OPCODE_UP4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
6901 {TGSI_OPCODE_X2D, 0, ALU_OP0_NOP, tgsi_unsupported},
6902 {TGSI_OPCODE_ARA, 0, ALU_OP0_NOP, tgsi_unsupported},
6903 {TGSI_OPCODE_ARR, 0, ALU_OP0_NOP, tgsi_eg_arl},
6904 {TGSI_OPCODE_BRA, 0, ALU_OP0_NOP, tgsi_unsupported},
6905 {TGSI_OPCODE_CAL, 0, ALU_OP0_NOP, tgsi_unsupported},
6906 {TGSI_OPCODE_RET, 0, ALU_OP0_NOP, tgsi_unsupported},
6907 {TGSI_OPCODE_SSG, 0, ALU_OP0_NOP, tgsi_ssg},
6908 {TGSI_OPCODE_CMP, 0, ALU_OP0_NOP, tgsi_cmp},
6909 {TGSI_OPCODE_SCS, 0, ALU_OP0_NOP, tgsi_scs},
6910 {TGSI_OPCODE_TXB, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
6911 {TGSI_OPCODE_NRM, 0, ALU_OP0_NOP, tgsi_unsupported},
6912 {TGSI_OPCODE_DIV, 0, ALU_OP0_NOP, tgsi_unsupported},
6913 {TGSI_OPCODE_DP2, 0, ALU_OP2_DOT4, tgsi_dp},
6914 {TGSI_OPCODE_TXL, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
6915 {TGSI_OPCODE_BRK, 0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
6916 {TGSI_OPCODE_IF, 0, ALU_OP0_NOP, tgsi_if},
6917 {TGSI_OPCODE_UIF, 0, ALU_OP0_NOP, tgsi_uif},
6918 {76, 0, ALU_OP0_NOP, tgsi_unsupported},
6919 {TGSI_OPCODE_ELSE, 0, ALU_OP0_NOP, tgsi_else},
6920 {TGSI_OPCODE_ENDIF, 0, ALU_OP0_NOP, tgsi_endif},
6921 /* gap */
6922 {79, 0, ALU_OP0_NOP, tgsi_unsupported},
6923 {80, 0, ALU_OP0_NOP, tgsi_unsupported},
6924 {TGSI_OPCODE_PUSHA, 0, ALU_OP0_NOP, tgsi_unsupported},
6925 {TGSI_OPCODE_POPA, 0, ALU_OP0_NOP, tgsi_unsupported},
6926 {TGSI_OPCODE_CEIL, 0, ALU_OP1_CEIL, tgsi_op2},
6927 {TGSI_OPCODE_I2F, 0, ALU_OP1_INT_TO_FLT, tgsi_op2},
6928 {TGSI_OPCODE_NOT, 0, ALU_OP1_NOT_INT, tgsi_op2},
6929 {TGSI_OPCODE_TRUNC, 0, ALU_OP1_TRUNC, tgsi_op2},
6930 {TGSI_OPCODE_SHL, 0, ALU_OP2_LSHL_INT, tgsi_op2},
6931 /* gap */
6932 {88, 0, ALU_OP0_NOP, tgsi_unsupported},
6933 {TGSI_OPCODE_AND, 0, ALU_OP2_AND_INT, tgsi_op2},
6934 {TGSI_OPCODE_OR, 0, ALU_OP2_OR_INT, tgsi_op2},
6935 {TGSI_OPCODE_MOD, 0, ALU_OP0_NOP, tgsi_imod},
6936 {TGSI_OPCODE_XOR, 0, ALU_OP2_XOR_INT, tgsi_op2},
6937 {TGSI_OPCODE_SAD, 0, ALU_OP0_NOP, tgsi_unsupported},
6938 {TGSI_OPCODE_TXF, 0, FETCH_OP_LD, tgsi_tex},
6939 {TGSI_OPCODE_TXQ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
6940 {TGSI_OPCODE_CONT, 0, CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
6941 {TGSI_OPCODE_EMIT, 0, CF_OP_EMIT_VERTEX, tgsi_gs_emit},
6942 {TGSI_OPCODE_ENDPRIM, 0, CF_OP_CUT_VERTEX, tgsi_gs_emit},
6943 {TGSI_OPCODE_BGNLOOP, 0, ALU_OP0_NOP, tgsi_bgnloop},
6944 {TGSI_OPCODE_BGNSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
6945 {TGSI_OPCODE_ENDLOOP, 0, ALU_OP0_NOP, tgsi_endloop},
6946 {TGSI_OPCODE_ENDSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
6947 {TGSI_OPCODE_TXQ_LZ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
6948 /* gap */
6949 {104, 0, ALU_OP0_NOP, tgsi_unsupported},
6950 {105, 0, ALU_OP0_NOP, tgsi_unsupported},
6951 {106, 0, ALU_OP0_NOP, tgsi_unsupported},
6952 {TGSI_OPCODE_NOP, 0, ALU_OP0_NOP, tgsi_unsupported},
6953 /* NOTE(review): this "gap" marker looks stale -- no numeric placeholder follows (cf. EG table) */
6954 {TGSI_OPCODE_FSEQ, 0, ALU_OP2_SETE_DX10, tgsi_op2},
6955 {TGSI_OPCODE_FSGE, 0, ALU_OP2_SETGE_DX10, tgsi_op2},
6956 {TGSI_OPCODE_FSLT, 0, ALU_OP2_SETGT_DX10, tgsi_op2_swap},
6957 {TGSI_OPCODE_FSNE, 0, ALU_OP2_SETNE_DX10, tgsi_op2_swap},
6958 {TGSI_OPCODE_NRM4, 0, ALU_OP0_NOP, tgsi_unsupported},
6959 {TGSI_OPCODE_CALLNZ, 0, ALU_OP0_NOP, tgsi_unsupported},
6960 /* gap */
6961 {114, 0, ALU_OP0_NOP, tgsi_unsupported},
6962 {TGSI_OPCODE_BREAKC, 0, ALU_OP0_NOP, tgsi_unsupported}, /* handled on r600 only (tgsi_loop_breakc) */
6963 {TGSI_OPCODE_KILL_IF, 0, ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
6964 {TGSI_OPCODE_END, 0, ALU_OP0_NOP, tgsi_end}, /* aka HALT */
6965 /* gap */
6966 {118, 0, ALU_OP0_NOP, tgsi_unsupported},
6967 {TGSI_OPCODE_F2I, 0, ALU_OP1_FLT_TO_INT, tgsi_op2},
6968 {TGSI_OPCODE_IDIV, 0, ALU_OP0_NOP, tgsi_idiv},
6969 {TGSI_OPCODE_IMAX, 0, ALU_OP2_MAX_INT, tgsi_op2},
6970 {TGSI_OPCODE_IMIN, 0, ALU_OP2_MIN_INT, tgsi_op2},
6971 {TGSI_OPCODE_INEG, 0, ALU_OP2_SUB_INT, tgsi_ineg},
6972 {TGSI_OPCODE_ISGE, 0, ALU_OP2_SETGE_INT, tgsi_op2},
6973 {TGSI_OPCODE_ISHR, 0, ALU_OP2_ASHR_INT, tgsi_op2},
6974 {TGSI_OPCODE_ISLT, 0, ALU_OP2_SETGT_INT, tgsi_op2_swap},
6975 {TGSI_OPCODE_F2U, 0, ALU_OP1_FLT_TO_UINT, tgsi_op2},
6976 {TGSI_OPCODE_U2F, 0, ALU_OP1_UINT_TO_FLT, tgsi_op2},
6977 {TGSI_OPCODE_UADD, 0, ALU_OP2_ADD_INT, tgsi_op2},
6978 {TGSI_OPCODE_UDIV, 0, ALU_OP0_NOP, tgsi_udiv},
6979 {TGSI_OPCODE_UMAD, 0, ALU_OP0_NOP, tgsi_umad},
6980 {TGSI_OPCODE_UMAX, 0, ALU_OP2_MAX_UINT, tgsi_op2},
6981 {TGSI_OPCODE_UMIN, 0, ALU_OP2_MIN_UINT, tgsi_op2},
6982 {TGSI_OPCODE_UMOD, 0, ALU_OP0_NOP, tgsi_umod},
6983 {TGSI_OPCODE_UMUL, 0, ALU_OP2_MULLO_INT, cayman_mul_int_instr},
6984 {TGSI_OPCODE_USEQ, 0, ALU_OP2_SETE_INT, tgsi_op2},
6985 {TGSI_OPCODE_USGE, 0, ALU_OP2_SETGE_UINT, tgsi_op2},
6986 {TGSI_OPCODE_USHR, 0, ALU_OP2_LSHR_INT, tgsi_op2},
6987 {TGSI_OPCODE_USLT, 0, ALU_OP2_SETGT_UINT, tgsi_op2_swap},
6988 {TGSI_OPCODE_USNE, 0, ALU_OP2_SETNE_INT, tgsi_op2},
6989 {TGSI_OPCODE_SWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
6990 {TGSI_OPCODE_CASE, 0, ALU_OP0_NOP, tgsi_unsupported},
6991 {TGSI_OPCODE_DEFAULT, 0, ALU_OP0_NOP, tgsi_unsupported},
6992 {TGSI_OPCODE_ENDSWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
6993 {TGSI_OPCODE_SAMPLE, 0, 0, tgsi_unsupported},
6994 {TGSI_OPCODE_SAMPLE_I, 0, 0, tgsi_unsupported},
6995 {TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
6996 {TGSI_OPCODE_SAMPLE_B, 0, 0, tgsi_unsupported},
6997 {TGSI_OPCODE_SAMPLE_C, 0, 0, tgsi_unsupported},
6998 {TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
6999 {TGSI_OPCODE_SAMPLE_D, 0, 0, tgsi_unsupported},
7000 {TGSI_OPCODE_SAMPLE_L, 0, 0, tgsi_unsupported},
7001 {TGSI_OPCODE_GATHER4, 0, 0, tgsi_unsupported},
7002 {TGSI_OPCODE_SVIEWINFO, 0, 0, tgsi_unsupported},
7003 {TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
7004 {TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
7005 {TGSI_OPCODE_UARL, 0, ALU_OP1_MOVA_INT, tgsi_eg_arl},
7006 {TGSI_OPCODE_UCMP, 0, ALU_OP0_NOP, tgsi_ucmp},
7007 {TGSI_OPCODE_IABS, 0, 0, tgsi_iabs},
7008 {TGSI_OPCODE_ISSG, 0, 0, tgsi_issg},
7009 {TGSI_OPCODE_LOAD, 0, ALU_OP0_NOP, tgsi_unsupported},
7010 {TGSI_OPCODE_STORE, 0, ALU_OP0_NOP, tgsi_unsupported},
7011 {TGSI_OPCODE_MFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
7012 {TGSI_OPCODE_LFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
7013 {TGSI_OPCODE_SFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
7014 {TGSI_OPCODE_BARRIER, 0, ALU_OP0_NOP, tgsi_unsupported},
7015 {TGSI_OPCODE_ATOMUADD, 0, ALU_OP0_NOP, tgsi_unsupported},
7016 {TGSI_OPCODE_ATOMXCHG, 0, ALU_OP0_NOP, tgsi_unsupported},
7017 {TGSI_OPCODE_ATOMCAS, 0, ALU_OP0_NOP, tgsi_unsupported},
7018 {TGSI_OPCODE_ATOMAND, 0, ALU_OP0_NOP, tgsi_unsupported},
7019 {TGSI_OPCODE_ATOMOR, 0, ALU_OP0_NOP, tgsi_unsupported},
7020 {TGSI_OPCODE_ATOMXOR, 0, ALU_OP0_NOP, tgsi_unsupported},
7021 {TGSI_OPCODE_ATOMUMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
7022 {TGSI_OPCODE_ATOMUMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
7023 {TGSI_OPCODE_ATOMIMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
7024 {TGSI_OPCODE_ATOMIMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
7025 {TGSI_OPCODE_TEX2, 0, FETCH_OP_SAMPLE, tgsi_tex},
7026 {TGSI_OPCODE_TXB2, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
7027 {TGSI_OPCODE_TXL2, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
7028 {TGSI_OPCODE_LAST, 0, ALU_OP0_NOP, tgsi_unsupported}, /* sentinel: table ends at TGSI_OPCODE_LAST */
7029 };