r600g: kill off r600_shader_tgsi_instruction::{tgsi_opcode,is_op3}
[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include "r600_sq.h"
24 #include "r600_llvm.h"
25 #include "r600_formats.h"
26 #include "r600_opcodes.h"
27 #include "r600_shader.h"
28 #include "r600d.h"
29
30 #include "sb/sb_public.h"
31
32 #include "pipe/p_shader_tokens.h"
33 #include "tgsi/tgsi_info.h"
34 #include "tgsi/tgsi_parse.h"
35 #include "tgsi/tgsi_scan.h"
36 #include "tgsi/tgsi_dump.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include <stdio.h>
40 #include <errno.h>
41
42 /* CAYMAN notes
43 Why CAYMAN got loops for lots of instructions is explained here.
44
45 -These 8xx t-slot only ops are implemented in all vector slots.
46 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47 These 8xx t-slot only opcodes become vector ops, with all four
48 slots expecting the arguments on sources a and b. Result is
49 broadcast to all channels.
50 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
51 These 8xx t-slot only opcodes become vector ops in the z, y, and
52 x slots.
53 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55 SQRT_IEEE/_64
56 SIN/COS
57 The w slot may have an independent co-issued operation, or if the
58 result is required to be in the w slot, the opcode above may be
59 issued in the w slot as well.
60 The compiler must issue the source argument to slots z, y, and x
61 */
62
63 static int r600_shader_from_tgsi(struct r600_context *rctx,
64 struct r600_pipe_shader *pipeshader,
65 struct r600_shader_key key);
66
67
68 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
69 int size, unsigned comp_mask) {
70
71 if (!size)
72 return;
73
74 if (ps->num_arrays == ps->max_arrays) {
75 ps->max_arrays += 64;
76 ps->arrays = realloc(ps->arrays, ps->max_arrays *
77 sizeof(struct r600_shader_array));
78 }
79
80 int n = ps->num_arrays;
81 ++ps->num_arrays;
82
83 ps->arrays[n].comp_mask = comp_mask;
84 ps->arrays[n].gpr_start = start_gpr;
85 ps->arrays[n].gpr_count = size;
86 }
87
88 static void r600_dump_streamout(struct pipe_stream_output_info *so)
89 {
90 unsigned i;
91
92 fprintf(stderr, "STREAMOUT\n");
93 for (i = 0; i < so->num_outputs; i++) {
94 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
95 so->output[i].start_component;
96 fprintf(stderr, " %i: MEM_STREAM0_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
97 i, so->output[i].output_buffer,
98 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
99 so->output[i].register_index,
100 mask & 1 ? "x" : "",
101 mask & 2 ? "y" : "",
102 mask & 4 ? "z" : "",
103 mask & 8 ? "w" : "",
104 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
105 }
106 }
107
108 static int store_shader(struct pipe_context *ctx,
109 struct r600_pipe_shader *shader)
110 {
111 struct r600_context *rctx = (struct r600_context *)ctx;
112 uint32_t *ptr, i;
113
114 if (shader->bo == NULL) {
115 shader->bo = (struct r600_resource*)
116 pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
117 if (shader->bo == NULL) {
118 return -ENOMEM;
119 }
120 ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
121 if (R600_BIG_ENDIAN) {
122 for (i = 0; i < shader->shader.bc.ndw; ++i) {
123 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
124 }
125 } else {
126 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
127 }
128 rctx->b.ws->buffer_unmap(shader->bo->cs_buf);
129 }
130
131 return 0;
132 }
133
/* Compile a TGSI shader into r600 bytecode, optionally run it through the
 * SB optimizing backend, upload it to a buffer object, and build the
 * chip-specific register state for the shader stage.
 *
 * Returns 0 on success or a negative errno; on any failure the partially
 * built shader is torn down via r600_pipe_shader_destroy() before return.
 */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    struct r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
	/* SB = optimizing shader backend; can be disabled via debug flags */
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	/* vs_as_es: vertex shader feeding a geometry shader (export shader) */
	unsigned export_shader = key.vs_as_es;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}

	/* disable SB for geom shaders - it can't handle the CF_EMIT instructions */
	use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY);
	/* disable SB for shaders using CF_INDEX_0/1 (sampler/ubo array indexing) as it doesn't handle those currently */
	use_sb &= !shader->shader.uses_index_registers;

	/* Check if the bytecode has already been built. When using the llvm
	 * backend, r600_shader_from_tgsi() will take care of building the
	 * bytecode.
	 */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	if (dump && !sb_disasm) {
		/* plain (non-SB) disassembly of the raw bytecode */
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		/* SB pass: optimizes (when use_sb) and/or disassembles (when dump) */
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
					     dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case TGSI_PROCESSOR_GEOMETRY:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case TGSI_PROCESSOR_VERTEX:
		if (rctx->b.chip_class >= EVERGREEN) {
			if (export_shader)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case TGSI_PROCESSOR_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
250
/* Release everything a compiled shader owns: the GPU bytecode buffer
 * object, the built bytecode, and the shader's command buffer. */
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
{
	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
257
258 /*
259 * tgsi -> r600 shader
260 */
261 struct r600_shader_tgsi_instruction;
262
/* A TGSI source operand decoded into r600 terms. */
struct r600_shader_src {
	unsigned	sel;        /* register / special-value selector */
	unsigned	swizzle[4]; /* per-channel swizzle */
	unsigned	neg;        /* negate modifier */
	unsigned	abs;        /* absolute-value modifier */
	unsigned	rel;        /* relative (indirect) addressing */
	unsigned	kc_bank;    /* constant-cache bank */
	boolean		kc_rel;     /* true if cache bank is indexed */
	uint32_t	value[4];   /* literal values when sel is V_SQ_ALU_SRC_LITERAL */
};
273
/* Per-interpolator bookkeeping for evergreen: whether the interpolator is
 * used by this shader and which barycentric i/j pair was assigned to it. */
struct eg_interp {
	boolean		enabled;
	unsigned	ij_index;
};
278
/* Transient state used while translating one TGSI shader into r600
 * bytecode; discarded when translation finishes. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;      /* TGSI_PROCESSOR_* */
	unsigned				file_offset[TGSI_FILE_COUNT]; /* GPR base per register file */
	unsigned				temp_reg;  /* first driver-reserved temp GPR */
	struct r600_shader_tgsi_instruction	*inst_info; /* table entry for the current opcode */
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];    /* decoded sources of the current instruction */
	uint32_t				*literals; /* immediate pool gathered from the token stream */
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used;
	boolean					use_llvm;
	/* needed for evergreen interpolation */
	struct eg_interp			eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean					clip_vertex_write;
	unsigned				cv_output;        /* output slot of CLIPVERTEX */
	unsigned				edgeflag_output;  /* output slot of EDGEFLAG */
	int					fragcoord_input;  /* input slot of POSITION (fragment) */
	int					native_integers;
	int					next_ring_offset;   /* GS input ring allocation cursor */
	int					gs_out_ring_offset; /* GS output ring allocation cursor */
	int					gs_next_vertex;
	struct r600_shader			*gs_for_vs;
	int					gs_export_gpr_treg;
};
312
/* One entry of the per-chip TGSI translation tables: the hardware opcode
 * and the callback that emits bytecode for the TGSI instruction. */
struct r600_shader_tgsi_instruction {
	unsigned	op;
	int (*process)(struct r600_shader_ctx *ctx);
};
317
318 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind);
319 static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
320 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
321 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
322 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
323 static int tgsi_else(struct r600_shader_ctx *ctx);
324 static int tgsi_endif(struct r600_shader_ctx *ctx);
325 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
326 static int tgsi_endloop(struct r600_shader_ctx *ctx);
327 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
328 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
329 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
330 unsigned int dst_reg);
331 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
332 const struct r600_shader_src *shader_src,
333 unsigned chan);
334
335 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
336 {
337 struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
338 int j;
339
340 if (i->Instruction.NumDstRegs > 1) {
341 R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
342 return -EINVAL;
343 }
344 if (i->Instruction.Predicate) {
345 R600_ERR("predicate unsupported\n");
346 return -EINVAL;
347 }
348 #if 0
349 if (i->Instruction.Label) {
350 R600_ERR("label unsupported\n");
351 return -EINVAL;
352 }
353 #endif
354 for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
355 if (i->Src[j].Register.Dimension) {
356 switch (i->Src[j].Register.File) {
357 case TGSI_FILE_CONSTANT:
358 break;
359 case TGSI_FILE_INPUT:
360 if (ctx->type == TGSI_PROCESSOR_GEOMETRY)
361 break;
362 default:
363 R600_ERR("unsupported src %d (dimension %d)\n", j,
364 i->Src[j].Register.Dimension);
365 return -EINVAL;
366 }
367 }
368 }
369 for (j = 0; j < i->Instruction.NumDstRegs; j++) {
370 if (i->Dst[j].Register.Dimension) {
371 R600_ERR("unsupported dst (dimension)\n");
372 return -EINVAL;
373 }
374 }
375 return 0;
376 }
377
378 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
379 {
380 if (interpolate == TGSI_INTERPOLATE_COLOR ||
381 interpolate == TGSI_INTERPOLATE_LINEAR ||
382 interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
383 {
384 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
385 int loc;
386
387 switch(location) {
388 case TGSI_INTERPOLATE_LOC_CENTER:
389 loc = 1;
390 break;
391 case TGSI_INTERPOLATE_LOC_CENTROID:
392 loc = 2;
393 break;
394 case TGSI_INTERPOLATE_LOC_SAMPLE:
395 default:
396 loc = 0; break;
397 }
398
399 return is_linear * 3 + loc;
400 }
401
402 return -1;
403 }
404
405 static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
406 int input)
407 {
408 int i = eg_get_interpolator_index(
409 ctx->shader->input[input].interpolate,
410 ctx->shader->input[input].interpolate_location);
411 assert(i >= 0);
412 ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
413 }
414
/* Emit the interpolation ALU ops for one smoothly-interpolated input.
 *
 * The barycentric pair for this input lives in GPR (ij_index / 2); pairs
 * are packed two per register. Eight ops are emitted: four INTERP_ZW then
 * four INTERP_XY, one per destination channel. Only the ops that produce
 * the needed result channels write the input's GPR: z/w from the first
 * group (i == 2,3) and x/y from the second (i == 4,5).
 */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		/* only i == 2..5 write results (zw then xy of the dest GPR) */
		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* alternate between the two channels of the barycentric pair */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		/* interp ops require this fixed bank swizzle */
		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
455
456 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
457 {
458 int i, r;
459 struct r600_bytecode_alu alu;
460
461 for (i = 0; i < 4; i++) {
462 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
463
464 alu.op = ALU_OP1_INTERP_LOAD_P0;
465
466 alu.dst.sel = ctx->shader->input[input].gpr;
467 alu.dst.write = 1;
468
469 alu.dst.chan = i;
470
471 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
472 alu.src[0].chan = i;
473
474 if (i == 3)
475 alu.last = 1;
476 r = r600_bytecode_add_alu(ctx->bc, &alu);
477 if (r)
478 return r;
479 }
480 return 0;
481 }
482
483 /*
484 * Special export handling in shaders
485 *
486 * shader export ARRAY_BASE for EXPORT_POS:
487 * 60 is position
488 * 61 is misc vector
489 * 62, 63 are clip distance vectors
490 *
491 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
492 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
493 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
494 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
495 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
496 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
497 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
498 * exclusive from render target index)
499 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
500 *
501 *
502 * shader export ARRAY_BASE for EXPORT_PIXEL:
503 * 0-7 CB targets
504 * 61 computed Z vector
505 *
506 * The use of the values exported in the computed Z vector are controlled
507 * by DB_SHADER_CONTROL:
508 * Z_EXPORT_ENABLE - Z as a float in RED
509 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
510 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
511 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
512 * DB_SOURCE_FORMAT - export control restrictions
513 *
514 */
515
516
517 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
518 static int r600_spi_sid(struct r600_shader_io * io)
519 {
520 int index, name = io->name;
521
522 /* These params are handled differently, they don't need
523 * semantic indices, so we'll use 0 for them.
524 */
525 if (name == TGSI_SEMANTIC_POSITION ||
526 name == TGSI_SEMANTIC_PSIZE ||
527 name == TGSI_SEMANTIC_EDGEFLAG ||
528 name == TGSI_SEMANTIC_FACE ||
529 name == TGSI_SEMANTIC_SAMPLEMASK)
530 index = 0;
531 else {
532 if (name == TGSI_SEMANTIC_GENERIC) {
533 /* For generic params simply use sid from tgsi */
534 index = io->sid;
535 } else {
536 /* For non-generic params - pack name and sid into 8 bits */
537 index = 0x80 | (name<<3) | (io->sid);
538 }
539
540 /* Make sure that all really used indices have nonzero value, so
541 * we can just compare it to 0 later instead of comparing the name
542 * with different values to detect special cases. */
543 index++;
544 }
545
546 return index;
547 };
548
549 /* turn input into interpolate on EG */
550 static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
551 {
552 int r = 0;
553
554 if (ctx->shader->input[index].spi_sid) {
555 ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
556 if (ctx->shader->input[index].interpolate > 0) {
557 evergreen_interp_assign_ij_index(ctx, index);
558 if (!ctx->use_llvm)
559 r = evergreen_interp_alu(ctx, index);
560 } else {
561 if (!ctx->use_llvm)
562 r = evergreen_interp_flat(ctx, index);
563 }
564 }
565 return r;
566 }
567
568 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
569 {
570 struct r600_bytecode_alu alu;
571 int i, r;
572 int gpr_front = ctx->shader->input[front].gpr;
573 int gpr_back = ctx->shader->input[back].gpr;
574
575 for (i = 0; i < 4; i++) {
576 memset(&alu, 0, sizeof(alu));
577 alu.op = ALU_OP3_CNDGT;
578 alu.is_op3 = 1;
579 alu.dst.write = 1;
580 alu.dst.sel = gpr_front;
581 alu.src[0].sel = ctx->face_gpr;
582 alu.src[1].sel = gpr_front;
583 alu.src[2].sel = gpr_back;
584
585 alu.dst.chan = i;
586 alu.src[1].chan = i;
587 alu.src[2].chan = i;
588 alu.last = (i==3);
589
590 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
591 return r;
592 }
593
594 return 0;
595 }
596
597 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
598 {
599 int i;
600 i = ctx->shader->noutput++;
601 ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
602 ctx->shader->output[i].sid = 0;
603 ctx->shader->output[i].gpr = 0;
604 ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
605 ctx->shader->output[i].write_mask = 0x4;
606 ctx->shader->output[i].spi_sid = prim_id_sid;
607
608 return 0;
609 }
610
/* Translate one TGSI declaration token: record inputs/outputs with their
 * semantic info and GPR assignment, register indirectly-addressed temp
 * arrays, and handle system-value declarations.
 *
 * Returns 0 on success or -EINVAL for an unsupported register file.
 */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		i = ctx->shader->ninput;
		assert(i < Elements(ctx->shader->input));
		ctx->shader->ninput += count;
		ctx->shader->input[i].name = d->Semantic.Name;
		ctx->shader->input[i].sid = d->Semantic.Index;
		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
		ctx->shader->input[i].interpolate_location = d->Interp.Location;
		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
			switch (ctx->shader->input[i].name) {
			case TGSI_SEMANTIC_FACE:
				if (ctx->face_gpr != -1)
					ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
				else
					ctx->face_gpr = ctx->shader->input[i].gpr;
				break;
			case TGSI_SEMANTIC_COLOR:
				ctx->colors_used++;
				break;
			case TGSI_SEMANTIC_POSITION:
				ctx->fragcoord_input = i;
				break;
			case TGSI_SEMANTIC_PRIMID:
				/* set this for now */
				ctx->shader->gs_prim_id_input = true;
				ctx->shader->ps_prim_id_input = i;
				break;
			}
			if (ctx->bc->chip_class >= EVERGREEN) {
				/* evergreen interpolates inputs in the shader itself */
				if ((r = evergreen_interp_input(ctx, i)))
					return r;
			}
		} else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
			/* FIXME probably skip inputs if they aren't passed in the ring */
			ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
			ctx->next_ring_offset += 16;
			if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
				ctx->shader->gs_prim_id_input = true;
		}
		/* a declaration may span a range: clone the first entry for the
		 * remaining slots, advancing the GPR for each */
		for (j = 1; j < count; ++j) {
			ctx->shader->input[i + j] = ctx->shader->input[i];
			ctx->shader->input[i + j].gpr += j;
		}
		break;
	case TGSI_FILE_OUTPUT:
		i = ctx->shader->noutput++;
		assert(i < Elements(ctx->shader->output));
		ctx->shader->output[i].name = d->Semantic.Name;
		ctx->shader->output[i].sid = d->Semantic.Index;
		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
		if (ctx->type == TGSI_PROCESSOR_VERTEX ||
		    ctx->type == TGSI_PROCESSOR_GEOMETRY) {
			ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_CLIPDIST:
				/* 4 bits of write mask per clipdist vector */
				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
				break;
			case TGSI_SEMANTIC_PSIZE:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_point_size = 1;
				break;
			case TGSI_SEMANTIC_EDGEFLAG:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_edgeflag = 1;
				ctx->edgeflag_output = i;
				break;
			case TGSI_SEMANTIC_VIEWPORT_INDEX:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_viewport = 1;
				break;
			case TGSI_SEMANTIC_LAYER:
				ctx->shader->vs_out_misc_write = 1;
				ctx->shader->vs_out_layer = 1;
				break;
			case TGSI_SEMANTIC_CLIPVERTEX:
				ctx->clip_vertex_write = TRUE;
				ctx->cv_output = i;
				break;
			}
			if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				ctx->gs_out_ring_offset += 16;
			}
		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			switch (d->Semantic.Name) {
			case TGSI_SEMANTIC_COLOR:
				ctx->shader->nr_ps_max_color_exports++;
				break;
			}
		}
		break;
	case TGSI_FILE_TEMPORARY:
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				/* indirectly addressed temp range becomes a GPR array */
				r600_add_gpr_array(ctx->shader,
						   ctx->file_offset[TGSI_FILE_TEMPORARY] +
						   d->Range.First,
						   d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_ADDRESS:
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			if (!ctx->native_integers) {
				/* instance id is expected in GPR0.w; convert the
				 * integer value to float in place for shaders
				 * without native integer support */
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_INT_TO_FLT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		/* fallthrough: unhandled system values are an error */
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
760
761 static int r600_get_temp(struct r600_shader_ctx *ctx)
762 {
763 return ctx->temp_reg + ctx->max_driver_temp_used++;
764 }
765
/* Scan the TGSI tokens for system values (sample mask, sample id /
 * sample pos) and for interpolateAt* opcodes that implicitly need them,
 * then reserve input GPRs for the ones in use, starting at gpr_offset.
 * Also marks interpolators needed by interpolateAt* in eg_interpolators.
 *
 * Returns the next free GPR after the allocated system-value inputs.
 */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;                       /* where to store the allocated GPR */
		unsigned name, alternate_name;  /* tgsi semantics sharing this slot */
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int i, k, num_regs = 0;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					inputs[1].enabled = true; /* needs SAMPLEID */
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < Elements(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* allocate one GPR per enabled system value input */
	for (i = 0; i < Elements(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput ++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
845
/*
 * for evergreen we need to scan the shader to find the number of GPRs we need to
 * reserve for interpolation and system values
 *
 * we need to know if we are going to emit
 * any sample or centroid inputs
 * if perspective and linear are required
 *
 * Returns the first GPR available after the interpolator and
 * system-value reservations.
 */
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	int i;
	int num_baryc;
	struct tgsi_parse_context parse;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	/* mark interpolators used by the declared inputs */
	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		}
	}

	tgsi_parse_free(&parse);

	/* assign gpr to each interpolator according to priority */
	num_baryc = 0;
	for (i = 0; i < Elements(ctx->eg_interpolators); i++) {
		if (ctx->eg_interpolators[i].enabled) {
			ctx->eg_interpolators[i].ij_index = num_baryc;
			num_baryc ++;
		}
	}

	/* XXX PULL MODEL and LINE STIPPLE */

	/* two barycentric pairs fit in one GPR */
	num_baryc = (num_baryc + 1) >> 1;
	return allocate_system_value_inputs(ctx, num_baryc);
}
925
/* sample_id_sel == NULL means fetch for current sample */
/* Fetch a sample position from the driver's sample-positions constant
 * buffer with a vertex fetch. The fetch index is either the given sample
 * id source (channel chan_sel) or, when sample_id is NULL, the fixed
 * point position register's .w component (the current sample id).
 * Returns the temp GPR holding the xyzw result, or a negative errno. */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	assert(ctx->fixed_pt_position_gpr != -1);

	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_SAMPLE_POSITIONS_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		struct r600_bytecode_alu alu;

		/* copy the requested sample id channel into the temp so the
		 * fetch can read it from .x */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 1; // first element is size of buffer
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}
980
981 static void tgsi_src(struct r600_shader_ctx *ctx,
982 const struct tgsi_full_src_register *tgsi_src,
983 struct r600_shader_src *r600_src)
984 {
985 memset(r600_src, 0, sizeof(*r600_src));
986 r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
987 r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
988 r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
989 r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
990 r600_src->neg = tgsi_src->Register.Negate;
991 r600_src->abs = tgsi_src->Register.Absolute;
992
993 if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
994 int index;
995 if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
996 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
997 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
998
999 index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
1000 r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
1001 if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
1002 return;
1003 }
1004 index = tgsi_src->Register.Index;
1005 r600_src->sel = V_SQ_ALU_SRC_LITERAL;
1006 memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
1007 } else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
1008 if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
1009 r600_src->swizzle[0] = 2; // Z value
1010 r600_src->swizzle[1] = 2;
1011 r600_src->swizzle[2] = 2;
1012 r600_src->swizzle[3] = 2;
1013 r600_src->sel = ctx->face_gpr;
1014 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
1015 r600_src->swizzle[0] = 3; // W value
1016 r600_src->swizzle[1] = 3;
1017 r600_src->swizzle[2] = 3;
1018 r600_src->swizzle[3] = 3;
1019 r600_src->sel = ctx->fixed_pt_position_gpr;
1020 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
1021 r600_src->swizzle[0] = 0;
1022 r600_src->swizzle[1] = 1;
1023 r600_src->swizzle[2] = 4;
1024 r600_src->swizzle[3] = 4;
1025 r600_src->sel = load_sample_position(ctx, NULL, -1);
1026 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
1027 r600_src->swizzle[0] = 3;
1028 r600_src->swizzle[1] = 3;
1029 r600_src->swizzle[2] = 3;
1030 r600_src->swizzle[3] = 3;
1031 r600_src->sel = 0;
1032 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
1033 r600_src->swizzle[0] = 0;
1034 r600_src->swizzle[1] = 0;
1035 r600_src->swizzle[2] = 0;
1036 r600_src->swizzle[3] = 0;
1037 r600_src->sel = 0;
1038 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1039 r600_src->swizzle[0] = 3;
1040 r600_src->swizzle[1] = 3;
1041 r600_src->swizzle[2] = 3;
1042 r600_src->swizzle[3] = 3;
1043 r600_src->sel = 1;
1044 }
1045 } else {
1046 if (tgsi_src->Register.Indirect)
1047 r600_src->rel = V_SQ_REL_RELATIVE;
1048 r600_src->sel = tgsi_src->Register.Index;
1049 r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
1050 }
1051 if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
1052 if (tgsi_src->Register.Dimension) {
1053 r600_src->kc_bank = tgsi_src->Dimension.Index;
1054 if (tgsi_src->Dimension.Indirect) {
1055 r600_src->kc_rel = 1;
1056 }
1057 }
1058 }
1059 }
1060
/* Fetch one 4-dword constant from a constant buffer through the vertex
 * cache, using relative (AR-based) addressing.
 *
 * cb_idx  - constant buffer id to read from
 * cb_rel  - buffer index mode (non-zero selects an index register)
 * offset  - static element offset added to the AR value (0 = none)
 * ar_chan - channel of the AR register that holds the index
 * dst_reg - GPR receiving the fetched constant in .xyzw
 *
 * Returns 0 on success, a negative error code otherwise. */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
	unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
	unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		/* Bias the AR index by the static offset; the biased index is
		 * computed into dst_reg (free until the fetch lands there). */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	/* Emit the actual vertex-cache fetch indexed by ar_reg.ar_chan. */
	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.endian = r600_endian_swap(32);
	vtx.buffer_index_mode = cb_rel;	// cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1116
/* Emit a fetch of one GS input attribute from the ESGS ring buffer.
 *
 * src     - TGSI source; Register.Index selects the attribute,
 *           Dimension.Index selects the input vertex
 * dst_reg - GPR receiving the fetched attribute in .xyzw
 *
 * Returns 0 on success or a negative error code. */
static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	int r;
	unsigned index = src->Register.Index;
	unsigned vtx_id = src->Dimension.Index;
	int offset_reg = vtx_id / 3;
	int offset_chan = vtx_id % 3;

	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */

	/* skip over R0.z (PrimitiveID, see above) */
	if (offset_reg == 0 && offset_chan == 2)
		offset_chan = 3;

	if (src->Dimension.Indirect) {
		int treg[3];
		int t2;
		struct r600_bytecode_alu alu;
		int r, i;

		/* Indirect vertex index: the ring offsets live in scattered
		 * channels (R0.x/y/w) but relative addressing can only index
		 * whole GPRs, so copy them into the .x channels of three
		 * consecutive temps (Rt, Rt+1, Rt+2) and use a relative MOV
		 * to select the right one - this is apparently what the
		 * fglrx driver generates as well. */
		for (i = 0; i < 3; i++) {
			treg[i] = r600_get_temp(ctx);
		}
		t2 = r600_get_temp(ctx);
		for (i = 0; i < 3; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].sel = 0;
			alu.src[0].chan = i == 2 ? 3 : i;	/* x, y, then w */
			alu.dst.sel = treg[i];
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		/* t2.x = treg[index].x via relative GPR addressing */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = treg[0];
		alu.src[0].rel = 1;
		alu.dst.sel = t2;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		offset_reg = t2;
	}


	/* Fetch the attribute from the ESGS ring at the per-vertex base
	 * offset plus the attribute's slot (16 bytes per attribute). */
	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = offset_reg;
	vtx.src_sel_x = offset_chan;
	vtx.offset = index * 16; /*bytes*/
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	if (ctx->bc->chip_class >= EVERGREEN) {
		vtx.use_const_fields = 1;
	} else {
		vtx.data_format = FMT_32_32_32_32_FLOAT;
	}

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1195
1196 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1197 {
1198 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1199 int i;
1200
1201 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1202 struct tgsi_full_src_register *src = &inst->Src[i];
1203
1204 if (src->Register.File == TGSI_FILE_INPUT) {
1205 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1206 /* primitive id is in R0.z */
1207 ctx->src[i].sel = 0;
1208 ctx->src[i].swizzle[0] = 2;
1209 }
1210 }
1211 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1212 int treg = r600_get_temp(ctx);
1213
1214 fetch_gs_input(ctx, src, treg);
1215 ctx->src[i].sel = treg;
1216 }
1217 }
1218 return 0;
1219 }
1220
/* Prepare the constant-file sources of the current instruction: run
 * tgsi_src() on every operand, fetch relatively-addressed constants into
 * temp GPRs, and when several constants are read, copy all but one into
 * temps (NOTE(review): presumably because a single ALU group can only
 * read a limited set of distinct constant operands - confirm against the
 * ISA docs).
 * Returns 0 on success or a negative error code. */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* Translate every source operand and count constant-file ones. */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	/* j counts constants still allowed to stay in place; every relative
	 * constant and all but the last direct constant are materialized. */
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].kc_rel)
			ctx->shader->uses_index_registers = true;

		if (ctx->src[i].rel) {
			/* Indirect constant: fetch through the vertex cache.
			 * sel - 512 undoes the constant-file GPR mapping
			 * (TGSI_FILE_CONSTANT is offset at 512). */
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* Copy the constant into a temp with four MOVs. */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
1278
1279 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1280 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1281 {
1282 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1283 struct r600_bytecode_alu alu;
1284 int i, j, k, nliteral, r;
1285
1286 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1287 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1288 nliteral++;
1289 }
1290 }
1291 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1292 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1293 int treg = r600_get_temp(ctx);
1294 for (k = 0; k < 4; k++) {
1295 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1296 alu.op = ALU_OP1_MOV;
1297 alu.src[0].sel = ctx->src[i].sel;
1298 alu.src[0].chan = k;
1299 alu.src[0].value = ctx->src[i].value[k];
1300 alu.dst.sel = treg;
1301 alu.dst.chan = k;
1302 alu.dst.write = 1;
1303 if (k == 3)
1304 alu.last = 1;
1305 r = r600_bytecode_add_alu(ctx->bc, &alu);
1306 if (r)
1307 return r;
1308 }
1309 ctx->src[i].sel = treg;
1310 j--;
1311 }
1312 }
1313 return 0;
1314 }
1315
1316 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1317 {
1318 int i, r, count = ctx->shader->ninput;
1319
1320 for (i = 0; i < count; i++) {
1321 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1322 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
1323 if (r)
1324 return r;
1325 }
1326 }
1327 return 0;
1328 }
1329
/* Emit stream output (transform feedback) writes for every output listed
 * in 'so'.  Each output is stored to one of the four streamout buffers
 * with a MEM_STREAM CF instruction.
 * Returns 0 on success, -EINVAL for invalid streamout state. */
static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so)
{
	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
	int i, j, r;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SHADER_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		r = -EINVAL;
		goto out_err;
	}
	for (i = 0; i < so->num_outputs; i++) {
		if (so->output[i].output_buffer >= 4) {
			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
				 so->output[i].output_buffer);
			r = -EINVAL;
			goto out_err;
		}
	}

	/* Initialize locations where the outputs are stored. */
	for (i = 0; i < so->num_outputs; i++) {
		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;

		/* Lower outputs with dst_offset < start_component.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X. */
		if (so->output[i].dst_offset < so->output[i].start_component) {
			unsigned tmp = r600_get_temp(ctx);

			/* Shift the components down into a temp, starting at .x. */
			for (j = 0; j < so->output[i].num_components; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = so_gpr[i];
				alu.src[0].chan = so->output[i].start_component + j;

				alu.dst.sel = tmp;
				alu.dst.chan = j;
				alu.dst.write = 1;
				if (j == so->output[i].num_components - 1)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			so->output[i].start_component = 0;
			so_gpr[i] = tmp;
		}
	}

	/* Write outputs to buffers. */
	for (i = 0; i < so->num_outputs; i++) {
		struct r600_bytecode_output output;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = so_gpr[i];
		output.elem_size = so->output[i].num_components;
		output.array_base = so->output[i].dst_offset - so->output[i].start_component;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
		output.burst_count = 1;
		/* array_size is an upper limit for the burst_count
		 * with MEM_STREAM instructions */
		output.array_size = 0xFFF;
		output.comp_mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
		/* Evergreen encodes the target buffer in the opcode directly;
		 * pre-Evergreen uses one opcode per stream instead. */
		if (ctx->bc->chip_class >= EVERGREEN) {
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0_BUF0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM0_BUF1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM0_BUF2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM0_BUF3;
				break;
			}
		} else {
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM3;
				break;
			}
		}
		r = r600_bytecode_add_output(ctx->bc, &output);
		if (r)
			goto out_err;
	}
	return 0;
out_err:
	return r;
}
1437
1438 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
1439 {
1440 struct r600_bytecode_alu alu;
1441 unsigned reg;
1442
1443 if (!ctx->shader->vs_out_edgeflag)
1444 return;
1445
1446 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
1447
1448 /* clamp(x, 0, 1) */
1449 memset(&alu, 0, sizeof(alu));
1450 alu.op = ALU_OP1_MOV;
1451 alu.src[0].sel = reg;
1452 alu.dst.sel = reg;
1453 alu.dst.write = 1;
1454 alu.dst.clamp = 1;
1455 alu.last = 1;
1456 r600_bytecode_add_alu(ctx->bc, &alu);
1457
1458 memset(&alu, 0, sizeof(alu));
1459 alu.op = ALU_OP1_FLT_TO_INT;
1460 alu.src[0].sel = reg;
1461 alu.dst.sel = reg;
1462 alu.dst.write = 1;
1463 alu.last = 1;
1464 r600_bytecode_add_alu(ctx->bc, &alu);
1465 }
1466
/* Build the "GS copy shader": a small VS-typed program that runs after a
 * geometry shader, fetches the vertices the GS wrote to the GSVS ring
 * buffer and performs the actual position/parameter exports (plus any
 * streamout).  The result is stored in gs->gs_copy_shader.
 * Returns r600_bytecode_build()'s result, or 0 if allocation fails. */
static int generate_gs_copy_shader(struct r600_context *rctx,
				   struct r600_pipe_shader *gs,
				   struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	int ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int i, next_clip_pos = 61, next_param = 0;

	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	/* The copy shader exports exactly the GS output declarations. */
	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	ctx.type = ctx.bc->type = TGSI_PROCESSOR_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	/* R0.x = R0.x & 0x3fffffff */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30 */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* Compare R0.y against 0 and set the predicate; together with the
	 * JUMP below this conditionally skips the fetch/export body.
	 * PRED_SETE_INT __, R0.y, 0 */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_PRED_SETE_INT;
	alu.src[0].chan = 1;
	alu.src[1].sel = V_SQ_ALU_SRC_0;
	alu.execute_mask = 1;
	alu.update_pred = 1;
	alu.last = 1;
	r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

	r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
	cf_jump = ctx.bc->cf_last;	/* target patched once POP address is known */

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}

	/* XXX handle clipvertex, streamout? */
	emit_streamout(&ctx, so);

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* Default: export as PARAM; special semantics below override
		 * this with a POS export and/or adjust swizzles. */
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* Ensure at least one POS and one PARAM export exist; emit fully
	 * masked dummies otherwise (the EXPORT_DONE patching below
	 * dereferences both pointers). */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	/* Mark the final export of each category as EXPORT_DONE. */
	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	/* Patch the conditional jump to land just past the POP. */
	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;

	ctx.bc->nstack = 1;
	cshader->shader.ring_item_size = ocnt * 16;

	return r600_bytecode_build(ctx.bc);
}
1718
1719 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind)
1720 {
1721 struct r600_bytecode_output output;
1722 int i, k, ring_offset;
1723
1724 for (i = 0; i < ctx->shader->noutput; i++) {
1725 if (ctx->gs_for_vs) {
1726 /* for ES we need to lookup corresponding ring offset expected by GS
1727 * (map this output to GS input by name and sid) */
1728 /* FIXME precompute offsets */
1729 ring_offset = -1;
1730 for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
1731 struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
1732 struct r600_shader_io *out = &ctx->shader->output[i];
1733 if (in->name == out->name && in->sid == out->sid)
1734 ring_offset = in->ring_offset;
1735 }
1736
1737 if (ring_offset == -1)
1738 continue;
1739 } else
1740 ring_offset = i * 16;
1741
1742 /* next_ring_offset after parsing input decls contains total size of
1743 * single vertex data, gs_next_vertex - current vertex index */
1744 if (!ind)
1745 ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
1746
1747 /* get a temp and add the ring offset to the next vertex base in the shader */
1748 memset(&output, 0, sizeof(struct r600_bytecode_output));
1749 output.gpr = ctx->shader->output[i].gpr;
1750 output.elem_size = 3;
1751 output.comp_mask = 0xF;
1752 output.burst_count = 1;
1753
1754 if (ind)
1755 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
1756 else
1757 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1758 output.op = CF_OP_MEM_RING;
1759
1760
1761 if (ind) {
1762 output.array_base = ring_offset >> 2; /* in dwords */
1763 output.array_size = 0xfff;
1764 output.index_gpr = ctx->gs_export_gpr_treg;
1765 } else
1766 output.array_base = ring_offset >> 2; /* in dwords */
1767 r600_bytecode_add_output(ctx->bc, &output);
1768 }
1769
1770 if (ind) {
1771 struct r600_bytecode_alu alu;
1772 int r;
1773
1774 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1775 alu.op = ALU_OP2_ADD_INT;
1776 alu.src[0].sel = ctx->gs_export_gpr_treg;
1777 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1778 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
1779 alu.dst.sel = ctx->gs_export_gpr_treg;
1780 alu.dst.write = 1;
1781 alu.last = 1;
1782 r = r600_bytecode_add_alu(ctx->bc, &alu);
1783 if (r)
1784 return r;
1785 }
1786 ++ctx->gs_next_vertex;
1787 return 0;
1788 }
1789
1790 static int r600_shader_from_tgsi(struct r600_context *rctx,
1791 struct r600_pipe_shader *pipeshader,
1792 struct r600_shader_key key)
1793 {
1794 struct r600_screen *rscreen = rctx->screen;
1795 struct r600_shader *shader = &pipeshader->shader;
1796 struct tgsi_token *tokens = pipeshader->selector->tokens;
1797 struct pipe_stream_output_info so = pipeshader->selector->so;
1798 struct tgsi_full_immediate *immediate;
1799 struct tgsi_full_property *property;
1800 struct r600_shader_ctx ctx;
1801 struct r600_bytecode_output output[32];
1802 unsigned output_done, noutput;
1803 unsigned opcode;
1804 int i, j, k, r = 0;
1805 int next_param_base = 0, next_clip_base;
1806 int max_color_exports = MAX2(key.nr_cbufs, 1);
1807 /* Declarations used by llvm code */
1808 bool use_llvm = false;
1809 bool indirect_gprs;
1810 bool ring_outputs = false;
1811 bool pos_emitted = false;
1812
1813 #ifdef R600_USE_LLVM
1814 use_llvm = rscreen->b.debug_flags & DBG_LLVM;
1815 #endif
1816 ctx.bc = &shader->bc;
1817 ctx.shader = shader;
1818 ctx.native_integers = true;
1819
1820 shader->vs_as_gs_a = key.vs_as_gs_a;
1821 shader->vs_as_es = key.vs_as_es;
1822
1823 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
1824 rscreen->has_compressed_msaa_texturing);
1825 ctx.tokens = tokens;
1826 tgsi_scan_shader(tokens, &ctx.info);
1827 shader->indirect_files = ctx.info.indirect_files;
1828 indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT);
1829 tgsi_parse_init(&ctx.parse, tokens);
1830 ctx.type = ctx.parse.FullHeader.Processor.Processor;
1831 shader->processor_type = ctx.type;
1832 ctx.bc->type = shader->processor_type;
1833
1834 ring_outputs = key.vs_as_es || (ctx.type == TGSI_PROCESSOR_GEOMETRY);
1835
1836 if (key.vs_as_es) {
1837 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
1838 } else {
1839 ctx.gs_for_vs = NULL;
1840 }
1841
1842 ctx.next_ring_offset = 0;
1843 ctx.gs_out_ring_offset = 0;
1844 ctx.gs_next_vertex = 0;
1845
1846 shader->uses_index_registers = false;
1847 ctx.face_gpr = -1;
1848 ctx.fixed_pt_position_gpr = -1;
1849 ctx.fragcoord_input = -1;
1850 ctx.colors_used = 0;
1851 ctx.clip_vertex_write = 0;
1852
1853 shader->nr_ps_color_exports = 0;
1854 shader->nr_ps_max_color_exports = 0;
1855
1856 shader->two_side = key.color_two_side;
1857
1858 /* register allocations */
1859 /* Values [0,127] correspond to GPR[0..127].
1860 * Values [128,159] correspond to constant buffer bank 0
1861 * Values [160,191] correspond to constant buffer bank 1
1862 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1863 * Values [256,287] correspond to constant buffer bank 2 (EG)
1864 * Values [288,319] correspond to constant buffer bank 3 (EG)
1865 * Other special values are shown in the list below.
1866 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1867 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1868 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1869 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1870 * 248 SQ_ALU_SRC_0: special constant 0.0.
1871 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
1872 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
1873 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1874 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
1875 * 253 SQ_ALU_SRC_LITERAL: literal constant.
1876 * 254 SQ_ALU_SRC_PV: previous vector result.
1877 * 255 SQ_ALU_SRC_PS: previous scalar result.
1878 */
1879 for (i = 0; i < TGSI_FILE_COUNT; i++) {
1880 ctx.file_offset[i] = 0;
1881 }
1882
1883 #ifdef R600_USE_LLVM
1884 if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
1885 fprintf(stderr, "Warning: R600 LLVM backend does not support "
1886 "indirect adressing. Falling back to TGSI "
1887 "backend.\n");
1888 use_llvm = 0;
1889 }
1890 #endif
1891 if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1892 ctx.file_offset[TGSI_FILE_INPUT] = 1;
1893 if (!use_llvm) {
1894 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
1895 }
1896 }
1897 if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
1898 if (ctx.bc->chip_class >= EVERGREEN)
1899 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
1900 else
1901 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
1902 }
1903 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
1904 /* FIXME 1 would be enough in some cases (3 or less input vertices) */
1905 ctx.file_offset[TGSI_FILE_INPUT] = 2;
1906 }
1907 ctx.use_llvm = use_llvm;
1908
1909 if (use_llvm) {
1910 ctx.file_offset[TGSI_FILE_OUTPUT] =
1911 ctx.file_offset[TGSI_FILE_INPUT];
1912 } else {
1913 ctx.file_offset[TGSI_FILE_OUTPUT] =
1914 ctx.file_offset[TGSI_FILE_INPUT] +
1915 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1916 }
1917 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
1918 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
1919
1920 /* Outside the GPR range. This will be translated to one of the
1921 * kcache banks later. */
1922 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
1923
1924 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
1925 ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
1926 ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
1927 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
1928 ctx.gs_export_gpr_treg = ctx.bc->ar_reg + 1;
1929 ctx.temp_reg = ctx.bc->ar_reg + 2;
1930 ctx.bc->index_reg[0] = ctx.bc->ar_reg + 3;
1931 ctx.bc->index_reg[1] = ctx.bc->ar_reg + 4;
1932 } else {
1933 ctx.temp_reg = ctx.bc->ar_reg + 1;
1934 ctx.bc->index_reg[0] = ctx.bc->ar_reg + 2;
1935 ctx.bc->index_reg[1] = ctx.bc->ar_reg + 3;
1936 }
1937
1938 if (indirect_gprs) {
1939 shader->max_arrays = 0;
1940 shader->num_arrays = 0;
1941
1942 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
1943 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
1944 ctx.file_offset[TGSI_FILE_OUTPUT] -
1945 ctx.file_offset[TGSI_FILE_INPUT],
1946 0x0F);
1947 }
1948 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
1949 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
1950 ctx.file_offset[TGSI_FILE_TEMPORARY] -
1951 ctx.file_offset[TGSI_FILE_OUTPUT],
1952 0x0F);
1953 }
1954 }
1955
1956 ctx.nliterals = 0;
1957 ctx.literals = NULL;
1958 shader->fs_write_all = FALSE;
1959
1960 if (shader->vs_as_gs_a)
1961 vs_add_primid_output(&ctx, key.vs_prim_id_out);
1962
1963 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1964 tgsi_parse_token(&ctx.parse);
1965 switch (ctx.parse.FullToken.Token.Type) {
1966 case TGSI_TOKEN_TYPE_IMMEDIATE:
1967 immediate = &ctx.parse.FullToken.FullImmediate;
1968 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
1969 if(ctx.literals == NULL) {
1970 r = -ENOMEM;
1971 goto out_err;
1972 }
1973 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
1974 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
1975 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
1976 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
1977 ctx.nliterals++;
1978 break;
1979 case TGSI_TOKEN_TYPE_DECLARATION:
1980 r = tgsi_declaration(&ctx);
1981 if (r)
1982 goto out_err;
1983 break;
1984 case TGSI_TOKEN_TYPE_INSTRUCTION:
1985 break;
1986 case TGSI_TOKEN_TYPE_PROPERTY:
1987 property = &ctx.parse.FullToken.FullProperty;
1988 switch (property->Property.PropertyName) {
1989 case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
1990 if (property->u[0].Data == 1)
1991 shader->fs_write_all = TRUE;
1992 break;
1993 case TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION:
1994 if (property->u[0].Data == 1)
1995 shader->vs_position_window_space = TRUE;
1996 break;
1997 case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
1998 /* we don't need this one */
1999 break;
2000 case TGSI_PROPERTY_GS_INPUT_PRIM:
2001 shader->gs_input_prim = property->u[0].Data;
2002 break;
2003 case TGSI_PROPERTY_GS_OUTPUT_PRIM:
2004 shader->gs_output_prim = property->u[0].Data;
2005 break;
2006 case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES:
2007 shader->gs_max_out_vertices = property->u[0].Data;
2008 break;
2009 case TGSI_PROPERTY_GS_INVOCATIONS:
2010 shader->gs_num_invocations = property->u[0].Data;
2011 break;
2012 }
2013 break;
2014 default:
2015 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
2016 r = -EINVAL;
2017 goto out_err;
2018 }
2019 }
2020
2021 shader->ring_item_size = ctx.next_ring_offset;
2022
2023 /* Process two side if needed */
2024 if (shader->two_side && ctx.colors_used) {
2025 int i, count = ctx.shader->ninput;
2026 unsigned next_lds_loc = ctx.shader->nlds;
2027
2028 /* additional inputs will be allocated right after the existing inputs,
2029 * we won't need them after the color selection, so we don't need to
2030 * reserve these gprs for the rest of the shader code and to adjust
2031 * output offsets etc. */
2032 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
2033 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
2034
2035 /* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
2036 if (ctx.face_gpr == -1) {
2037 i = ctx.shader->ninput++;
2038 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
2039 ctx.shader->input[i].spi_sid = 0;
2040 ctx.shader->input[i].gpr = gpr++;
2041 ctx.face_gpr = ctx.shader->input[i].gpr;
2042 }
2043
2044 for (i = 0; i < count; i++) {
2045 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2046 int ni = ctx.shader->ninput++;
2047 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
2048 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
2049 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
2050 ctx.shader->input[ni].gpr = gpr++;
2051 // TGSI to LLVM needs to know the lds position of inputs.
2052 // Non LLVM path computes it later (in process_twoside_color)
2053 ctx.shader->input[ni].lds_pos = next_lds_loc++;
2054 ctx.shader->input[i].back_color_input = ni;
2055 if (ctx.bc->chip_class >= EVERGREEN) {
2056 if ((r = evergreen_interp_input(&ctx, ni)))
2057 return r;
2058 }
2059 }
2060 }
2061 }
2062
2063 /* LLVM backend setup */
2064 #ifdef R600_USE_LLVM
2065 if (use_llvm) {
2066 struct radeon_llvm_context radeon_llvm_ctx;
2067 LLVMModuleRef mod;
2068 bool dump = r600_can_dump_shader(&rscreen->b, tokens);
2069 boolean use_kill = false;
2070
2071 memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
2072 radeon_llvm_ctx.type = ctx.type;
2073 radeon_llvm_ctx.two_side = shader->two_side;
2074 radeon_llvm_ctx.face_gpr = ctx.face_gpr;
2075 radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1;
2076 radeon_llvm_ctx.r600_inputs = ctx.shader->input;
2077 radeon_llvm_ctx.r600_outputs = ctx.shader->output;
2078 radeon_llvm_ctx.color_buffer_count = max_color_exports;
2079 radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
2080 radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
2081 radeon_llvm_ctx.stream_outputs = &so;
2082 radeon_llvm_ctx.clip_vertex = ctx.cv_output;
2083 radeon_llvm_ctx.alpha_to_one = key.alpha_to_one;
2084 radeon_llvm_ctx.has_compressed_msaa_texturing =
2085 ctx.bc->has_compressed_msaa_texturing;
2086 mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
2087 ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
2088 ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
2089
2090 if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
2091 radeon_llvm_dispose(&radeon_llvm_ctx);
2092 use_llvm = 0;
2093 fprintf(stderr, "R600 LLVM backend failed to compile "
2094 "shader. Falling back to TGSI\n");
2095 } else {
2096 ctx.file_offset[TGSI_FILE_OUTPUT] =
2097 ctx.file_offset[TGSI_FILE_INPUT];
2098 }
2099 if (use_kill)
2100 ctx.shader->uses_kill = use_kill;
2101 radeon_llvm_dispose(&radeon_llvm_ctx);
2102 }
2103 #endif
2104 /* End of LLVM backend setup */
2105
2106 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
2107 shader->nr_ps_max_color_exports = 8;
2108
2109 if (!use_llvm) {
2110 if (ctx.fragcoord_input >= 0) {
2111 if (ctx.bc->chip_class == CAYMAN) {
2112 for (j = 0 ; j < 4; j++) {
2113 struct r600_bytecode_alu alu;
2114 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2115 alu.op = ALU_OP1_RECIP_IEEE;
2116 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
2117 alu.src[0].chan = 3;
2118
2119 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
2120 alu.dst.chan = j;
2121 alu.dst.write = (j == 3);
2122 alu.last = 1;
2123 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
2124 return r;
2125 }
2126 } else {
2127 struct r600_bytecode_alu alu;
2128 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2129 alu.op = ALU_OP1_RECIP_IEEE;
2130 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
2131 alu.src[0].chan = 3;
2132
2133 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
2134 alu.dst.chan = 3;
2135 alu.dst.write = 1;
2136 alu.last = 1;
2137 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
2138 return r;
2139 }
2140 }
2141
2142 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2143 struct r600_bytecode_alu alu;
2144 int r;
2145
2146 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2147 alu.op = ALU_OP1_MOV;
2148 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
2149 alu.src[0].value = 0;
2150 alu.dst.sel = ctx.gs_export_gpr_treg;
2151 alu.dst.write = 1;
2152 alu.last = 1;
2153 r = r600_bytecode_add_alu(ctx.bc, &alu);
2154 if (r)
2155 return r;
2156 }
2157 if (shader->two_side && ctx.colors_used) {
2158 if ((r = process_twoside_color_inputs(&ctx)))
2159 return r;
2160 }
2161
2162 tgsi_parse_init(&ctx.parse, tokens);
2163 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
2164 tgsi_parse_token(&ctx.parse);
2165 switch (ctx.parse.FullToken.Token.Type) {
2166 case TGSI_TOKEN_TYPE_INSTRUCTION:
2167 r = tgsi_is_supported(&ctx);
2168 if (r)
2169 goto out_err;
2170 ctx.max_driver_temp_used = 0;
2171 /* reserve first tmp for everyone */
2172 r600_get_temp(&ctx);
2173
2174 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
2175 if ((r = tgsi_split_constant(&ctx)))
2176 goto out_err;
2177 if ((r = tgsi_split_literal_constant(&ctx)))
2178 goto out_err;
2179 if (ctx.type == TGSI_PROCESSOR_GEOMETRY)
2180 if ((r = tgsi_split_gs_inputs(&ctx)))
2181 goto out_err;
2182 if (ctx.bc->chip_class == CAYMAN)
2183 ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
2184 else if (ctx.bc->chip_class >= EVERGREEN)
2185 ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
2186 else
2187 ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
2188 r = ctx.inst_info->process(&ctx);
2189 if (r)
2190 goto out_err;
2191 break;
2192 default:
2193 break;
2194 }
2195 }
2196 }
2197
2198 /* Reset the temporary register counter. */
2199 ctx.max_driver_temp_used = 0;
2200
2201 noutput = shader->noutput;
2202
2203 if (!ring_outputs && ctx.clip_vertex_write) {
2204 unsigned clipdist_temp[2];
2205
2206 clipdist_temp[0] = r600_get_temp(&ctx);
2207 clipdist_temp[1] = r600_get_temp(&ctx);
2208
2209 /* need to convert a clipvertex write into clipdistance writes and not export
2210 the clip vertex anymore */
2211
2212 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
2213 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
2214 shader->output[noutput].gpr = clipdist_temp[0];
2215 noutput++;
2216 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
2217 shader->output[noutput].gpr = clipdist_temp[1];
2218 noutput++;
2219
2220 /* reset spi_sid for clipvertex output to avoid confusing spi */
2221 shader->output[ctx.cv_output].spi_sid = 0;
2222
2223 shader->clip_dist_write = 0xFF;
2224
2225 for (i = 0; i < 8; i++) {
2226 int oreg = i >> 2;
2227 int ochan = i & 3;
2228
2229 for (j = 0; j < 4; j++) {
2230 struct r600_bytecode_alu alu;
2231 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2232 alu.op = ALU_OP2_DOT4;
2233 alu.src[0].sel = shader->output[ctx.cv_output].gpr;
2234 alu.src[0].chan = j;
2235
2236 alu.src[1].sel = 512 + i;
2237 alu.src[1].kc_bank = R600_UCP_CONST_BUFFER;
2238 alu.src[1].chan = j;
2239
2240 alu.dst.sel = clipdist_temp[oreg];
2241 alu.dst.chan = j;
2242 alu.dst.write = (j == ochan);
2243 if (j == 3)
2244 alu.last = 1;
2245 if (!use_llvm)
2246 r = r600_bytecode_add_alu(ctx.bc, &alu);
2247 if (r)
2248 return r;
2249 }
2250 }
2251 }
2252
2253 /* Add stream outputs. */
2254 if (!ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX &&
2255 so.num_outputs && !use_llvm)
2256 emit_streamout(&ctx, &so);
2257
2258 convert_edgeflag_to_int(&ctx);
2259
2260 if (ring_outputs) {
2261 if (key.vs_as_es)
2262 emit_gs_ring_writes(&ctx, FALSE);
2263 } else {
2264 /* Export output */
2265 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
2266
2267 for (i = 0, j = 0; i < noutput; i++, j++) {
2268 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2269 output[j].gpr = shader->output[i].gpr;
2270 output[j].elem_size = 3;
2271 output[j].swizzle_x = 0;
2272 output[j].swizzle_y = 1;
2273 output[j].swizzle_z = 2;
2274 output[j].swizzle_w = 3;
2275 output[j].burst_count = 1;
2276 output[j].type = -1;
2277 output[j].op = CF_OP_EXPORT;
2278 switch (ctx.type) {
2279 case TGSI_PROCESSOR_VERTEX:
2280 switch (shader->output[i].name) {
2281 case TGSI_SEMANTIC_POSITION:
2282 output[j].array_base = 60;
2283 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2284 pos_emitted = true;
2285 break;
2286
2287 case TGSI_SEMANTIC_PSIZE:
2288 output[j].array_base = 61;
2289 output[j].swizzle_y = 7;
2290 output[j].swizzle_z = 7;
2291 output[j].swizzle_w = 7;
2292 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2293 pos_emitted = true;
2294 break;
2295 case TGSI_SEMANTIC_EDGEFLAG:
2296 output[j].array_base = 61;
2297 output[j].swizzle_x = 7;
2298 output[j].swizzle_y = 0;
2299 output[j].swizzle_z = 7;
2300 output[j].swizzle_w = 7;
2301 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2302 pos_emitted = true;
2303 break;
2304 case TGSI_SEMANTIC_LAYER:
2305 /* spi_sid is 0 for outputs that are
2306 * not consumed by PS */
2307 if (shader->output[i].spi_sid) {
2308 output[j].array_base = next_param_base++;
2309 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2310 j++;
2311 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2312 }
2313 output[j].array_base = 61;
2314 output[j].swizzle_x = 7;
2315 output[j].swizzle_y = 7;
2316 output[j].swizzle_z = 0;
2317 output[j].swizzle_w = 7;
2318 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2319 pos_emitted = true;
2320 break;
2321 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2322 /* spi_sid is 0 for outputs that are
2323 * not consumed by PS */
2324 if (shader->output[i].spi_sid) {
2325 output[j].array_base = next_param_base++;
2326 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2327 j++;
2328 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2329 }
2330 output[j].array_base = 61;
2331 output[j].swizzle_x = 7;
2332 output[j].swizzle_y = 7;
2333 output[j].swizzle_z = 7;
2334 output[j].swizzle_w = 0;
2335 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2336 pos_emitted = true;
2337 break;
2338 case TGSI_SEMANTIC_CLIPVERTEX:
2339 j--;
2340 break;
2341 case TGSI_SEMANTIC_CLIPDIST:
2342 output[j].array_base = next_clip_base++;
2343 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2344 pos_emitted = true;
2345 /* spi_sid is 0 for clipdistance outputs that were generated
2346 * for clipvertex - we don't need to pass them to PS */
2347 if (shader->output[i].spi_sid) {
2348 j++;
2349 /* duplicate it as PARAM to pass to the pixel shader */
2350 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2351 output[j].array_base = next_param_base++;
2352 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2353 }
2354 break;
2355 case TGSI_SEMANTIC_FOG:
2356 output[j].swizzle_y = 4; /* 0 */
2357 output[j].swizzle_z = 4; /* 0 */
2358 output[j].swizzle_w = 5; /* 1 */
2359 break;
2360 case TGSI_SEMANTIC_PRIMID:
2361 output[j].swizzle_x = 2;
2362 output[j].swizzle_y = 4; /* 0 */
2363 output[j].swizzle_z = 4; /* 0 */
2364 output[j].swizzle_w = 4; /* 0 */
2365 break;
2366 }
2367
2368 break;
2369 case TGSI_PROCESSOR_FRAGMENT:
2370 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
2371 /* never export more colors than the number of CBs */
2372 if (shader->output[i].sid >= max_color_exports) {
2373 /* skip export */
2374 j--;
2375 continue;
2376 }
2377 output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
2378 output[j].array_base = shader->output[i].sid;
2379 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2380 shader->nr_ps_color_exports++;
2381 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
2382 for (k = 1; k < max_color_exports; k++) {
2383 j++;
2384 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2385 output[j].gpr = shader->output[i].gpr;
2386 output[j].elem_size = 3;
2387 output[j].swizzle_x = 0;
2388 output[j].swizzle_y = 1;
2389 output[j].swizzle_z = 2;
2390 output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
2391 output[j].burst_count = 1;
2392 output[j].array_base = k;
2393 output[j].op = CF_OP_EXPORT;
2394 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2395 shader->nr_ps_color_exports++;
2396 }
2397 }
2398 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
2399 output[j].array_base = 61;
2400 output[j].swizzle_x = 2;
2401 output[j].swizzle_y = 7;
2402 output[j].swizzle_z = output[j].swizzle_w = 7;
2403 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2404 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
2405 output[j].array_base = 61;
2406 output[j].swizzle_x = 7;
2407 output[j].swizzle_y = 1;
2408 output[j].swizzle_z = output[j].swizzle_w = 7;
2409 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2410 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
2411 output[j].array_base = 61;
2412 output[j].swizzle_x = 7;
2413 output[j].swizzle_y = 7;
2414 output[j].swizzle_z = 0;
2415 output[j].swizzle_w = 7;
2416 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2417 } else {
2418 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
2419 r = -EINVAL;
2420 goto out_err;
2421 }
2422 break;
2423 default:
2424 R600_ERR("unsupported processor type %d\n", ctx.type);
2425 r = -EINVAL;
2426 goto out_err;
2427 }
2428
2429 if (output[j].type==-1) {
2430 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2431 output[j].array_base = next_param_base++;
2432 }
2433 }
2434
2435 /* add fake position export */
2436 if (ctx.type == TGSI_PROCESSOR_VERTEX && pos_emitted == false) {
2437 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2438 output[j].gpr = 0;
2439 output[j].elem_size = 3;
2440 output[j].swizzle_x = 7;
2441 output[j].swizzle_y = 7;
2442 output[j].swizzle_z = 7;
2443 output[j].swizzle_w = 7;
2444 output[j].burst_count = 1;
2445 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2446 output[j].array_base = 60;
2447 output[j].op = CF_OP_EXPORT;
2448 j++;
2449 }
2450
2451 /* add fake param output for vertex shader if no param is exported */
2452 if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
2453 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2454 output[j].gpr = 0;
2455 output[j].elem_size = 3;
2456 output[j].swizzle_x = 7;
2457 output[j].swizzle_y = 7;
2458 output[j].swizzle_z = 7;
2459 output[j].swizzle_w = 7;
2460 output[j].burst_count = 1;
2461 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2462 output[j].array_base = 0;
2463 output[j].op = CF_OP_EXPORT;
2464 j++;
2465 }
2466
2467 /* add fake pixel export */
2468 if (ctx.type == TGSI_PROCESSOR_FRAGMENT && shader->nr_ps_color_exports == 0) {
2469 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2470 output[j].gpr = 0;
2471 output[j].elem_size = 3;
2472 output[j].swizzle_x = 7;
2473 output[j].swizzle_y = 7;
2474 output[j].swizzle_z = 7;
2475 output[j].swizzle_w = 7;
2476 output[j].burst_count = 1;
2477 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2478 output[j].array_base = 0;
2479 output[j].op = CF_OP_EXPORT;
2480 j++;
2481 }
2482
2483 noutput = j;
2484
2485 /* set export done on last export of each type */
2486 for (i = noutput - 1, output_done = 0; i >= 0; i--) {
2487 if (!(output_done & (1 << output[i].type))) {
2488 output_done |= (1 << output[i].type);
2489 output[i].op = CF_OP_EXPORT_DONE;
2490 }
2491 }
2492 /* add output to bytecode */
2493 if (!use_llvm) {
2494 for (i = 0; i < noutput; i++) {
2495 r = r600_bytecode_add_output(ctx.bc, &output[i]);
2496 if (r)
2497 goto out_err;
2498 }
2499 }
2500 }
2501
2502 /* add program end */
2503 if (!use_llvm) {
2504 if (ctx.bc->chip_class == CAYMAN)
2505 cm_bytecode_add_cf_end(ctx.bc);
2506 else {
2507 const struct cf_op_info *last = NULL;
2508
2509 if (ctx.bc->cf_last)
2510 last = r600_isa_cf(ctx.bc->cf_last->op);
2511
2512 /* alu clause instructions don't have EOP bit, so add NOP */
2513 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS)
2514 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2515
2516 ctx.bc->cf_last->end_of_program = 1;
2517 }
2518 }
2519
2520 /* check GPR limit - we have 124 = 128 - 4
2521 * (4 are reserved as alu clause temporary registers) */
2522 if (ctx.bc->ngpr > 124) {
2523 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
2524 r = -ENOMEM;
2525 goto out_err;
2526 }
2527
2528 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2529 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
2530 return r;
2531 }
2532
2533 free(ctx.literals);
2534 tgsi_parse_free(&ctx.parse);
2535 return 0;
2536 out_err:
2537 free(ctx.literals);
2538 tgsi_parse_free(&ctx.parse);
2539 return r;
2540 }
2541
2542 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
2543 {
2544 const unsigned tgsi_opcode =
2545 ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
2546 R600_ERR("%s tgsi opcode unsupported\n",
2547 tgsi_get_opcode_name(tgsi_opcode));
2548 return -EINVAL;
2549 }
2550
/* TGSI END: nothing to emit here — end-of-program handling is done by
 * the caller after the token stream has been processed. */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	return 0;
}
2555
2556 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
2557 const struct r600_shader_src *shader_src,
2558 unsigned chan)
2559 {
2560 bc_src->sel = shader_src->sel;
2561 bc_src->chan = shader_src->swizzle[chan];
2562 bc_src->neg = shader_src->neg;
2563 bc_src->abs = shader_src->abs;
2564 bc_src->rel = shader_src->rel;
2565 bc_src->value = shader_src->value[bc_src->chan];
2566 bc_src->kc_bank = shader_src->kc_bank;
2567 bc_src->kc_rel = shader_src->kc_rel;
2568 }
2569
/* Force the absolute-value modifier on a source operand; any pending
 * negate is cleared so the operand reads strictly as |x|. */
static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->abs = 1;
	bc_src->neg = 0;
}
2575
/* Flip the negate modifier on a source operand. */
static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->neg = !bc_src->neg;
}
2580
2581 static void tgsi_dst(struct r600_shader_ctx *ctx,
2582 const struct tgsi_full_dst_register *tgsi_dst,
2583 unsigned swizzle,
2584 struct r600_bytecode_alu_dst *r600_dst)
2585 {
2586 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2587
2588 r600_dst->sel = tgsi_dst->Register.Index;
2589 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
2590 r600_dst->chan = swizzle;
2591 r600_dst->write = 1;
2592 if (tgsi_dst->Register.Indirect)
2593 r600_dst->rel = V_SQ_REL_RELATIVE;
2594 if (inst->Instruction.Saturate) {
2595 r600_dst->clamp = 1;
2596 }
2597 }
2598
/* Return the index (0-3) of the highest channel set in a 4-bit TGSI
 * write mask; an empty mask yields 0. */
static int tgsi_last_instruction(unsigned writemask)
{
	int i;

	for (i = 3; i > 0; i--) {
		if (writemask & (1 << i))
			break;
	}
	return i;
}
2610
2611 static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
2612 {
2613 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2614 struct r600_bytecode_alu alu;
2615 unsigned write_mask = inst->Dst[0].Register.WriteMask;
2616 int i, j, r, lasti = tgsi_last_instruction(write_mask);
2617 /* use temp register if trans_only and more than one dst component */
2618 int use_tmp = trans_only && (write_mask ^ (1 << lasti));
2619
2620 for (i = 0; i <= lasti; i++) {
2621 if (!(write_mask & (1 << i)))
2622 continue;
2623
2624 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2625 if (use_tmp) {
2626 alu.dst.sel = ctx->temp_reg;
2627 alu.dst.chan = i;
2628 alu.dst.write = 1;
2629 } else
2630 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2631
2632 alu.op = ctx->inst_info->op;
2633 if (!swap) {
2634 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
2635 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
2636 }
2637 } else {
2638 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2639 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2640 }
2641 /* handle some special cases */
2642 switch (inst->Instruction.Opcode) {
2643 case TGSI_OPCODE_SUB:
2644 r600_bytecode_src_toggle_neg(&alu.src[1]);
2645 break;
2646 case TGSI_OPCODE_ABS:
2647 r600_bytecode_src_set_abs(&alu.src[0]);
2648 break;
2649 default:
2650 break;
2651 }
2652 if (i == lasti || trans_only) {
2653 alu.last = 1;
2654 }
2655 r = r600_bytecode_add_alu(ctx->bc, &alu);
2656 if (r)
2657 return r;
2658 }
2659
2660 if (use_tmp) {
2661 /* move result from temp to dst */
2662 for (i = 0; i <= lasti; i++) {
2663 if (!(write_mask & (1 << i)))
2664 continue;
2665
2666 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2667 alu.op = ALU_OP1_MOV;
2668 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2669 alu.src[0].sel = ctx->temp_reg;
2670 alu.src[0].chan = i;
2671 alu.last = (i == lasti);
2672
2673 r = r600_bytecode_add_alu(ctx->bc, &alu);
2674 if (r)
2675 return r;
2676 }
2677 }
2678 return 0;
2679 }
2680
/* Standard two-source op: sources emitted in TGSI order. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
2685
/* Two-source op with the operands emitted in reverse order
 * (src1, src0) — see the swap flag in tgsi_op2_s. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
2690
/* Two-source op in trans_only mode: every channel is emitted as its own
 * ALU group, staging through temp_reg when needed — see tgsi_op2_s. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
2695
2696 static int tgsi_ineg(struct r600_shader_ctx *ctx)
2697 {
2698 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2699 struct r600_bytecode_alu alu;
2700 int i, r;
2701 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2702
2703 for (i = 0; i < lasti + 1; i++) {
2704
2705 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2706 continue;
2707 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2708 alu.op = ctx->inst_info->op;
2709
2710 alu.src[0].sel = V_SQ_ALU_SRC_0;
2711
2712 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2713
2714 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2715
2716 if (i == lasti) {
2717 alu.last = 1;
2718 }
2719 r = r600_bytecode_add_alu(ctx->bc, &alu);
2720 if (r)
2721 return r;
2722 }
2723 return 0;
2724
2725 }
2726
2727 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
2728 {
2729 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2730 int i, j, r;
2731 struct r600_bytecode_alu alu;
2732 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2733
2734 for (i = 0 ; i < last_slot; i++) {
2735 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2736 alu.op = ctx->inst_info->op;
2737 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
2738 r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
2739
2740 /* RSQ should take the absolute value of src */
2741 if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
2742 r600_bytecode_src_set_abs(&alu.src[j]);
2743 }
2744 }
2745 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2746 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2747
2748 if (i == last_slot - 1)
2749 alu.last = 1;
2750 r = r600_bytecode_add_alu(ctx->bc, &alu);
2751 if (r)
2752 return r;
2753 }
2754 return 0;
2755 }
2756
/* Cayman lowering for t-slot integer multiplies: the op is issued across
 * all four vector slots (see the CAYMAN notes at the top of this file).
 * For each written channel k the op runs in slots 0-3 reading channel k
 * of every source, with only slot k's write enabled, accumulating into
 * temp_reg.  The staged channels are then MOVed to the real destination
 * — presumably so a destination overlapping a source is not clobbered
 * while later channels still read it (TODO confirm). */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* pass 1: one four-slot ALU group per written channel */
	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			/* only the slot matching the target channel writes */
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* pass 2: copy the staged channels from temp_reg to the destination */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
2804
2805 /*
2806 * r600 - trunc to -PI..PI range
2807 * r700 - normalize by dividing by 2PI
2808 * see fdo bug 27901
2809 */
2810 static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
2811 {
2812 static float half_inv_pi = 1.0 /(3.1415926535 * 2);
2813 static float double_pi = 3.1415926535 * 2;
2814 static float neg_pi = -3.1415926535;
2815
2816 int r;
2817 struct r600_bytecode_alu alu;
2818
2819 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2820 alu.op = ALU_OP3_MULADD;
2821 alu.is_op3 = 1;
2822
2823 alu.dst.chan = 0;
2824 alu.dst.sel = ctx->temp_reg;
2825 alu.dst.write = 1;
2826
2827 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2828
2829 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2830 alu.src[1].chan = 0;
2831 alu.src[1].value = *(uint32_t *)&half_inv_pi;
2832 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2833 alu.src[2].chan = 0;
2834 alu.last = 1;
2835 r = r600_bytecode_add_alu(ctx->bc, &alu);
2836 if (r)
2837 return r;
2838
2839 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2840 alu.op = ALU_OP1_FRACT;
2841
2842 alu.dst.chan = 0;
2843 alu.dst.sel = ctx->temp_reg;
2844 alu.dst.write = 1;
2845
2846 alu.src[0].sel = ctx->temp_reg;
2847 alu.src[0].chan = 0;
2848 alu.last = 1;
2849 r = r600_bytecode_add_alu(ctx->bc, &alu);
2850 if (r)
2851 return r;
2852
2853 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2854 alu.op = ALU_OP3_MULADD;
2855 alu.is_op3 = 1;
2856
2857 alu.dst.chan = 0;
2858 alu.dst.sel = ctx->temp_reg;
2859 alu.dst.write = 1;
2860
2861 alu.src[0].sel = ctx->temp_reg;
2862 alu.src[0].chan = 0;
2863
2864 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2865 alu.src[1].chan = 0;
2866 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
2867 alu.src[2].chan = 0;
2868
2869 if (ctx->bc->chip_class == R600) {
2870 alu.src[1].value = *(uint32_t *)&double_pi;
2871 alu.src[2].value = *(uint32_t *)&neg_pi;
2872 } else {
2873 alu.src[1].sel = V_SQ_ALU_SRC_1;
2874 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2875 alu.src[2].neg = 1;
2876 }
2877
2878 alu.last = 1;
2879 r = r600_bytecode_add_alu(ctx->bc, &alu);
2880 if (r)
2881 return r;
2882 return 0;
2883 }
2884
2885 static int cayman_trig(struct r600_shader_ctx *ctx)
2886 {
2887 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2888 struct r600_bytecode_alu alu;
2889 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2890 int i, r;
2891
2892 r = tgsi_setup_trig(ctx);
2893 if (r)
2894 return r;
2895
2896
2897 for (i = 0; i < last_slot; i++) {
2898 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2899 alu.op = ctx->inst_info->op;
2900 alu.dst.chan = i;
2901
2902 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2903 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2904
2905 alu.src[0].sel = ctx->temp_reg;
2906 alu.src[0].chan = 0;
2907 if (i == last_slot - 1)
2908 alu.last = 1;
2909 r = r600_bytecode_add_alu(ctx->bc, &alu);
2910 if (r)
2911 return r;
2912 }
2913 return 0;
2914 }
2915
2916 static int tgsi_trig(struct r600_shader_ctx *ctx)
2917 {
2918 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2919 struct r600_bytecode_alu alu;
2920 int i, r;
2921 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2922
2923 r = tgsi_setup_trig(ctx);
2924 if (r)
2925 return r;
2926
2927 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2928 alu.op = ctx->inst_info->op;
2929 alu.dst.chan = 0;
2930 alu.dst.sel = ctx->temp_reg;
2931 alu.dst.write = 1;
2932
2933 alu.src[0].sel = ctx->temp_reg;
2934 alu.src[0].chan = 0;
2935 alu.last = 1;
2936 r = r600_bytecode_add_alu(ctx->bc, &alu);
2937 if (r)
2938 return r;
2939
2940 /* replicate result */
2941 for (i = 0; i < lasti + 1; i++) {
2942 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2943 continue;
2944
2945 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2946 alu.op = ALU_OP1_MOV;
2947
2948 alu.src[0].sel = ctx->temp_reg;
2949 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2950 if (i == lasti)
2951 alu.last = 1;
2952 r = r600_bytecode_add_alu(ctx->bc, &alu);
2953 if (r)
2954 return r;
2955 }
2956 return 0;
2957 }
2958
/* SCS: dst = (cos(src.x), sin(src.x), 0.0, 1.0), with each component
 * emitted only when its channel is in the write mask. */
static int tgsi_scs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	/* We'll only need the trig stuff if we are going to write to the
	 * X or Y components of the destination vector.
	 */
	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
		r = tgsi_setup_trig(ctx);
		if (r)
			return r;
	}

	/* dst.x = COS */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
		if (ctx->bc->chip_class == CAYMAN) {
			/* replicate over three slots, but only slot 0
			 * actually writes dst.x (see CAYMAN notes above) */
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_COS;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				if (i == 0)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_COS;
			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.y = SIN */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
		if (ctx->bc->chip_class == CAYMAN) {
			/* replicate over three slots; only slot 1 writes dst.y */
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_SIN;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 1)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_SIN;
			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.z = 0.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.w = 1.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3079
3080 static int tgsi_kill(struct r600_shader_ctx *ctx)
3081 {
3082 const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3083 struct r600_bytecode_alu alu;
3084 int i, r;
3085
3086 for (i = 0; i < 4; i++) {
3087 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3088 alu.op = ctx->inst_info->op;
3089
3090 alu.dst.chan = i;
3091
3092 alu.src[0].sel = V_SQ_ALU_SRC_0;
3093
3094 if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
3095 alu.src[1].sel = V_SQ_ALU_SRC_1;
3096 alu.src[1].neg = 1;
3097 } else {
3098 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3099 }
3100 if (i == 3) {
3101 alu.last = 1;
3102 }
3103 r = r600_bytecode_add_alu(ctx->bc, &alu);
3104 if (r)
3105 return r;
3106 }
3107
3108 /* kill must be last in ALU */
3109 ctx->bc->force_add_cf = 1;
3110 ctx->shader->uses_kill = TRUE;
3111 return 0;
3112 }
3113
/* Emit the ALU sequence for TGSI LIT:
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0.0)
 *   dst.z = src.x > 0 ? pow(max(src.y, 0.0), src.w) : 0  (via MUL_LIT)
 *   dst.w = 1.0
 * The z computation is only emitted when the write mask includes z.
 * Cayman lacks a dedicated t-slot, so the transcendental ops (LOG/EXP)
 * are emitted in a 3-slot loop there (see file-header note).
 */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		int i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* alu still holds the LOG instruction emitted above (either
		 * branch); its destination tells us where the log result
		 * landed. */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
3263
3264 static int tgsi_rsq(struct r600_shader_ctx *ctx)
3265 {
3266 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3267 struct r600_bytecode_alu alu;
3268 int i, r;
3269
3270 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3271
3272 /* XXX:
3273 * For state trackers other than OpenGL, we'll want to use
3274 * _RECIPSQRT_IEEE instead.
3275 */
3276 alu.op = ALU_OP1_RECIPSQRT_CLAMPED;
3277
3278 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
3279 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
3280 r600_bytecode_src_set_abs(&alu.src[i]);
3281 }
3282 alu.dst.sel = ctx->temp_reg;
3283 alu.dst.write = 1;
3284 alu.last = 1;
3285 r = r600_bytecode_add_alu(ctx->bc, &alu);
3286 if (r)
3287 return r;
3288 /* replicate result */
3289 return tgsi_helper_tempx_replicate(ctx);
3290 }
3291
3292 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
3293 {
3294 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3295 struct r600_bytecode_alu alu;
3296 int i, r;
3297
3298 for (i = 0; i < 4; i++) {
3299 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3300 alu.src[0].sel = ctx->temp_reg;
3301 alu.op = ALU_OP1_MOV;
3302 alu.dst.chan = i;
3303 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3304 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3305 if (i == 3)
3306 alu.last = 1;
3307 r = r600_bytecode_add_alu(ctx->bc, &alu);
3308 if (r)
3309 return r;
3310 }
3311 return 0;
3312 }
3313
3314 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
3315 {
3316 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3317 struct r600_bytecode_alu alu;
3318 int i, r;
3319
3320 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3321 alu.op = ctx->inst_info->op;
3322 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
3323 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
3324 }
3325 alu.dst.sel = ctx->temp_reg;
3326 alu.dst.write = 1;
3327 alu.last = 1;
3328 r = r600_bytecode_add_alu(ctx->bc, &alu);
3329 if (r)
3330 return r;
3331 /* replicate result */
3332 return tgsi_helper_tempx_replicate(ctx);
3333 }
3334
3335 static int cayman_pow(struct r600_shader_ctx *ctx)
3336 {
3337 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3338 int i, r;
3339 struct r600_bytecode_alu alu;
3340 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
3341
3342 for (i = 0; i < 3; i++) {
3343 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3344 alu.op = ALU_OP1_LOG_IEEE;
3345 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
3346 alu.dst.sel = ctx->temp_reg;
3347 alu.dst.chan = i;
3348 alu.dst.write = 1;
3349 if (i == 2)
3350 alu.last = 1;
3351 r = r600_bytecode_add_alu(ctx->bc, &alu);
3352 if (r)
3353 return r;
3354 }
3355
3356 /* b * LOG2(a) */
3357 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3358 alu.op = ALU_OP2_MUL;
3359 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
3360 alu.src[1].sel = ctx->temp_reg;
3361 alu.dst.sel = ctx->temp_reg;
3362 alu.dst.write = 1;
3363 alu.last = 1;
3364 r = r600_bytecode_add_alu(ctx->bc, &alu);
3365 if (r)
3366 return r;
3367
3368 for (i = 0; i < last_slot; i++) {
3369 /* POW(a,b) = EXP2(b * LOG2(a))*/
3370 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3371 alu.op = ALU_OP1_EXP_IEEE;
3372 alu.src[0].sel = ctx->temp_reg;
3373
3374 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3375 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3376 if (i == last_slot - 1)
3377 alu.last = 1;
3378 r = r600_bytecode_add_alu(ctx->bc, &alu);
3379 if (r)
3380 return r;
3381 }
3382 return 0;
3383 }
3384
3385 static int tgsi_pow(struct r600_shader_ctx *ctx)
3386 {
3387 struct r600_bytecode_alu alu;
3388 int r;
3389
3390 /* LOG2(a) */
3391 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3392 alu.op = ALU_OP1_LOG_IEEE;
3393 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
3394 alu.dst.sel = ctx->temp_reg;
3395 alu.dst.write = 1;
3396 alu.last = 1;
3397 r = r600_bytecode_add_alu(ctx->bc, &alu);
3398 if (r)
3399 return r;
3400 /* b * LOG2(a) */
3401 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3402 alu.op = ALU_OP2_MUL;
3403 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
3404 alu.src[1].sel = ctx->temp_reg;
3405 alu.dst.sel = ctx->temp_reg;
3406 alu.dst.write = 1;
3407 alu.last = 1;
3408 r = r600_bytecode_add_alu(ctx->bc, &alu);
3409 if (r)
3410 return r;
3411 /* POW(a,b) = EXP2(b * LOG2(a))*/
3412 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3413 alu.op = ALU_OP1_EXP_IEEE;
3414 alu.src[0].sel = ctx->temp_reg;
3415 alu.dst.sel = ctx->temp_reg;
3416 alu.dst.write = 1;
3417 alu.last = 1;
3418 r = r600_bytecode_add_alu(ctx->bc, &alu);
3419 if (r)
3420 return r;
3421 return tgsi_helper_tempx_replicate(ctx);
3422 }
3423
/* Emit per-channel 32-bit integer division or modulo.
 *
 * mod:       0 = emit the quotient (DIV), 1 = emit the remainder (MOD)
 * signed_op: 0 = unsigned operands, 1 = signed operands (computed on the
 *            absolute values, sign fixed up at the end)
 *
 * The unsigned algorithm (numbered steps below) refines a reciprocal
 * estimate of the divisor and corrects the candidate quotient/remainder
 * by +/-1; the numbered comments inside the loop match the plan below.
 * Cayman has no t-slot, so RECIP/MULLO/MULHI are emitted in multi-slot
 * loops there (see file-header note).  Uses temp_reg plus three extra
 * temps; each enabled write-mask channel is processed independently.
 */
static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int tmp0 = ctx->temp_reg;
	int tmp1 = r600_get_temp(ctx);
	int tmp2 = r600_get_temp(ctx);
	int tmp3 = r600_get_temp(ctx);
	/* Unsigned path:
	 *
	 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
	 *
	 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
	 * 2. tmp0.z = lo (tmp0.x * src2)
	 * 3. tmp0.w = -tmp0.z
	 * 4. tmp0.y = hi (tmp0.x * src2)
	 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
	 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
	 * 7. tmp1.x = tmp0.x - tmp0.w
	 * 8. tmp1.y = tmp0.x + tmp0.w
	 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
	 * 10. tmp0.z = hi(tmp0.x * src1) = q
	 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
	 *
	 * 12. tmp0.w = src1 - tmp0.y = r
	 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
	 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
	 *
	 * if DIV
	 *
	 * 15. tmp1.z = tmp0.z + 1 = q + 1
	 * 16. tmp1.w = tmp0.z - 1 = q - 1
	 *
	 * else MOD
	 *
	 * 15. tmp1.z = tmp0.w - src2 = r - src2
	 * 16. tmp1.w = tmp0.w + src2 = r + src2
	 *
	 * endif
	 *
	 * 17. tmp1.x = tmp1.x & tmp1.y
	 *
	 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
	 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
	 *
	 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
	 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
	 *
	 * Signed path:
	 *
	 * Same as unsigned, using abs values of the operands,
	 * and fixing the sign of the result in the end.
	 */

	/* Process each destination channel enabled in the write mask. */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		if (signed_op) {

			/* tmp2.x = -src0 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = -src1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = V_SQ_ALU_SRC_0;

			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.z sign bit is set if src0 and src2 signs are different */
			/* it will be a sign of the quotient */
			if (!mod) {

				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_XOR_INT;

				alu.dst.sel = tmp2;
				alu.dst.chan = 2;
				alu.dst.write = 1;

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			/* tmp2.x = |src0| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp2.y = |src1| */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_CNDGE_INT;
			alu.is_op3 = 1;

			alu.dst.sel = tmp2;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			alu.src[2].sel = tmp2;
			alu.src[2].chan = 1;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
		if (ctx->bc->chip_class == CAYMAN) {
			/* tmp3.x = u2f(src2) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_UINT_TO_FLT;

			alu.dst.sel = tmp3;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* tmp0.x = recip(tmp3.x) */
			for (j = 0 ; j < 3; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 0);

				alu.src[0].sel = tmp3;
				alu.src[0].chan = 0;

				if (j == 2)
					alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}

			/* tmp3.x = tmp0.x * 2^32 (0x4f800000 = 2^32 as float) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MUL;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[1].value = 0x4f800000;

			alu.dst.sel = tmp3;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;

			/* tmp0.x = f2u(tmp3.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_FLT_TO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			alu.src[0].sel = tmp3;
			alu.src[0].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 0;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 2. tmp0.z = lo (tmp0.x * src2) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULLO_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 2);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;
				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
				}

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULLO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;
			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 3. tmp0.w = -tmp0.z */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 4. tmp0.y = hi (tmp0.x * src2) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 1);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;

				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
				}
				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src)) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 3);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 2;

				alu.src[1].sel = tmp0;
				alu.src[1].chan = 0;

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;

			alu.src[1].sel = tmp0;
			alu.src[1].chan = 0;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 7. tmp1.x = tmp0.x - tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 8. tmp1.y = tmp0.x + tmp0.w */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = 3;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 0;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 10. tmp0.z = hi(tmp0.x * src1) = q */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULHI_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 2);

				alu.src[0].sel = tmp0;
				alu.src[0].chan = 0;

				if (signed_op) {
					alu.src[1].sel = tmp2;
					alu.src[1].chan = 0;
				} else {
					r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
				}

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULHI_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 0;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 0;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
		if (ctx->bc->chip_class == CAYMAN) {
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_MULLO_UINT;

				alu.dst.sel = tmp0;
				alu.dst.chan = j;
				alu.dst.write = (j == 1);

				if (signed_op) {
					alu.src[0].sel = tmp2;
					alu.src[0].chan = 1;
				} else {
					r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
				}

				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = (j == 3);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_MULLO_UINT;

			alu.dst.sel = tmp0;
			alu.dst.chan = 1;
			alu.dst.write = 1;

			if (signed_op) {
				alu.src[0].sel = tmp2;
				alu.src[0].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			}

			alu.src[1].sel = tmp0;
			alu.src[1].chan = 2;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}

		/* 12. tmp0.w = src1 - tmp0.y = r */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;

		alu.dst.sel = tmp0;
		alu.dst.chan = 3;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp0;
		alu.src[0].chan = 3;
		if (signed_op) {
			alu.src[1].sel = tmp2;
			alu.src[1].chan = 1;
		} else {
			r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
		}

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SETGE_UINT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 1;
		alu.dst.write = 1;

		if (signed_op) {
			alu.src[0].sel = tmp2;
			alu.src[0].chan = 0;
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		}

		alu.src[1].sel = tmp0;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (mod) { /* UMOD */

			/* 15. tmp1.z = tmp0.w - src2 = r - src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_SUB_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;

			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.w + src2 = r + src2 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 3;
			if (signed_op) {
				alu.src[1].sel = tmp2;
				alu.src[1].chan = 1;
			} else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
			}

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		} else { /* UDIV */

			/* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 2;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

			/* 16. tmp1.w = tmp0.z - 1 = q - 1 (ADD of the -1 inline constant) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD_INT;

			alu.dst.sel = tmp1;
			alu.dst.chan = 3;
			alu.dst.write = 1;

			alu.src[0].sel = tmp0;
			alu.src[0].chan = 2;
			alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;

			alu.last = 1;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;

		}

		/* 17. tmp1.x = tmp1.x & tmp1.y */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_AND_INT;

		alu.dst.sel = tmp1;
		alu.dst.chan = 0;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 1;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
		/* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = tmp0;
		alu.dst.chan = 2;
		alu.dst.write = 1;

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 0;
		alu.src[1].sel = tmp0;
		alu.src[1].chan = mod ? 3 : 2;
		alu.src[2].sel = tmp1;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		/* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
		 * (for the unsigned case this writes straight to dst;
		 *  the signed case still needs the sign fixup below) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDE_INT;
		alu.is_op3 = 1;

		if (signed_op) {
			alu.dst.sel = tmp0;
			alu.dst.chan = 2;
			alu.dst.write = 1;
		} else {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		}

		alu.src[0].sel = tmp1;
		alu.src[0].chan = 1;
		alu.src[1].sel = tmp1;
		alu.src[1].chan = 3;
		alu.src[2].sel = tmp0;
		alu.src[2].chan = 2;

		alu.last = 1;
		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		if (signed_op) {

			/* fix the sign of the result */

			if (mod) {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* sign of the remainder is the same as the sign of src0 */
				/* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

			} else {

				/* tmp0.x = -tmp0.z */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP2_SUB_INT;

				alu.dst.sel = tmp0;
				alu.dst.chan = 0;
				alu.dst.write = 1;

				alu.src[0].sel = V_SQ_ALU_SRC_0;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;

				/* fix the quotient sign (same as the sign of src0*src1) */
				/* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP3_CNDGE_INT;
				alu.is_op3 = 1;

				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				alu.src[0].sel = tmp2;
				alu.src[0].chan = 2;
				alu.src[1].sel = tmp0;
				alu.src[1].chan = 2;
				alu.src[2].sel = tmp0;
				alu.src[2].chan = 0;

				alu.last = 1;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
	}
	return 0;
}
4267
/* TGSI UDIV: unsigned per-channel division (quotient). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 0);
}
4272
/* TGSI UMOD: unsigned per-channel modulo (remainder). */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 0);
}
4277
/* TGSI IDIV: signed integer division (quotient). */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 1);	/* mod=0, signed=1 */
}
4282
/* TGSI IMOD: signed integer modulo (remainder, sign follows src0). */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 1);	/* mod=1, signed=1 */
}
4287
4288
4289 static int tgsi_f2i(struct r600_shader_ctx *ctx)
4290 {
4291 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4292 struct r600_bytecode_alu alu;
4293 int i, r;
4294 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4295 int last_inst = tgsi_last_instruction(write_mask);
4296
4297 for (i = 0; i < 4; i++) {
4298 if (!(write_mask & (1<<i)))
4299 continue;
4300
4301 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4302 alu.op = ALU_OP1_TRUNC;
4303
4304 alu.dst.sel = ctx->temp_reg;
4305 alu.dst.chan = i;
4306 alu.dst.write = 1;
4307
4308 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4309 if (i == last_inst)
4310 alu.last = 1;
4311 r = r600_bytecode_add_alu(ctx->bc, &alu);
4312 if (r)
4313 return r;
4314 }
4315
4316 for (i = 0; i < 4; i++) {
4317 if (!(write_mask & (1<<i)))
4318 continue;
4319
4320 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4321 alu.op = ctx->inst_info->op;
4322
4323 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4324
4325 alu.src[0].sel = ctx->temp_reg;
4326 alu.src[0].chan = i;
4327
4328 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
4329 alu.last = 1;
4330 r = r600_bytecode_add_alu(ctx->bc, &alu);
4331 if (r)
4332 return r;
4333 }
4334
4335 return 0;
4336 }
4337
4338 static int tgsi_iabs(struct r600_shader_ctx *ctx)
4339 {
4340 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4341 struct r600_bytecode_alu alu;
4342 int i, r;
4343 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4344 int last_inst = tgsi_last_instruction(write_mask);
4345
4346 /* tmp = -src */
4347 for (i = 0; i < 4; i++) {
4348 if (!(write_mask & (1<<i)))
4349 continue;
4350
4351 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4352 alu.op = ALU_OP2_SUB_INT;
4353
4354 alu.dst.sel = ctx->temp_reg;
4355 alu.dst.chan = i;
4356 alu.dst.write = 1;
4357
4358 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4359 alu.src[0].sel = V_SQ_ALU_SRC_0;
4360
4361 if (i == last_inst)
4362 alu.last = 1;
4363 r = r600_bytecode_add_alu(ctx->bc, &alu);
4364 if (r)
4365 return r;
4366 }
4367
4368 /* dst = (src >= 0 ? src : tmp) */
4369 for (i = 0; i < 4; i++) {
4370 if (!(write_mask & (1<<i)))
4371 continue;
4372
4373 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4374 alu.op = ALU_OP3_CNDGE_INT;
4375 alu.is_op3 = 1;
4376 alu.dst.write = 1;
4377
4378 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4379
4380 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4381 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4382 alu.src[2].sel = ctx->temp_reg;
4383 alu.src[2].chan = i;
4384
4385 if (i == last_inst)
4386 alu.last = 1;
4387 r = r600_bytecode_add_alu(ctx->bc, &alu);
4388 if (r)
4389 return r;
4390 }
4391 return 0;
4392 }
4393
/* TGSI ISSG: integer sign.
 * dst = -1 if src < 0, 0 if src == 0, 1 if src > 0.
 * Built from two conditional selects, since there is no single
 * integer-sign ALU op.
 */
static int tgsi_issg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* tmp = (src >= 0 ? src : -1) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;	/* inline integer constant -1 */

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (tmp > 0 ? 1 : tmp) */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGT_INT;
		alu.is_op3 = 1;
		alu.dst.write = 1;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		alu.src[1].sel = V_SQ_ALU_SRC_1_INT;	/* inline integer constant 1 */

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		if (i == last_inst)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4454
4455
4456
/* TGSI SSG: float sign.
 * dst = -1.0 if src < 0, 0.0 if src == 0, 1.0 if src > 0.
 * Implemented with two CNDGT conditional selects; all four channels
 * are emitted unconditionally (no writemask loop here).
 * NOTE: op3 instructions always write their destination, so dst.write
 * is not set explicitly for the temp write below.
 */
static int tgsi_ssg(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	/* tmp = (src > 0 ? 1 : src) */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst = (-tmp > 0 ? -1 : tmp) */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		/* negated temp: the compare effectively tests tmp < 0 */
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		alu.src[0].neg = 1;

		/* negated 1.0 constant, i.e. -1.0 */
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[1].neg = 1;

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4508
/* TGSI BFI: bitfield insert.
 * dst = ((src1 << src2) & mask) | (src0 & ~mask)
 * with mask = BFM(src3, src2), i.e. insert the low src3 bits of src1
 * (insert value) into src0 (base) at bit offset src2.
 */
static int tgsi_bfi(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	t1 = ctx->temp_reg;

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* create mask tmp: t1 = BFM(bits, offset) = ((1 << bits) - 1) << offset */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_BFM_INT;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);	/* bits */
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);	/* offset */

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* shift insert left: t2 = insert << offset */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHL_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);	/* insert */
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);	/* offset */

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* actual bitfield insert: dst = (t2 & t1) | (base & ~t1) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_BFI_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;	/* mask */
		alu.src[0].chan = i;
		alu.src[1].sel = t2;	/* shifted insert */
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);	/* base */

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4588
/* TGSI IMSB/UMSB: index of the most significant bit.
 *
 * The hardware FFBH ops count bit positions from the MSB, while TGSI
 * wants the index counted from the LSB, so the result is remapped as
 * 31 - ffbh.  A negative FFBH result ("no bit found") must be passed
 * through unchanged, which the final CNDGE select handles.
 */
static int tgsi_msb(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
		ctx->inst_info->op == ALU_OP1_FFBH_UINT);

	t1 = ctx->temp_reg;

	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t1 = FFBH_INT / FFBH_UINT */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t2 = 31 - t1 (flip the bit index to be LSB-based) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = 31;
		alu.src[1].sel = t1;
		alu.src[1].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* result = t1 >= 0 ? t2 : t1 (keep the "not found" value) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		alu.src[2].sel = t1;
		alu.src[2].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4674
/* Lower TGSI INTERP_CENTROID/INTERP_OFFSET/INTERP_SAMPLE for
 * evergreen/cayman.
 *
 * Picks the pre-allocated i/j interpolator pair for the input's
 * interpolation mode, optionally adjusts the barycentrics for
 * OFFSET/SAMPLE using screen-space gradients, runs the INTERP_ZW /
 * INTERP_XY op pairs into a temp, and finally copies the requested
 * channels to the destination (INTERP ops cannot swizzle their dst).
 */
static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	int input;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	input = inst->Src[0].Register.Index;

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
	}

	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	/* two i/j pairs are packed per GPR: even ij_index in xy, odd in zw */
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
		}

		/* fetch d(ij)/dx and d(ij)/dy of the barycentrics */
		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7;
			tex.dst_sel_w = 7;
			tex.inst_mod = 1; /* use per pixel gradient calculation */
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		/* temp.xy = ij + gradH * offset.x (or * sample_pos.x) */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.xy += gradV * offset.y (or * sample_pos.y) */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* two 4-slot INTERP groups: INTERP_ZW then INTERP_XY; only the
	 * slots holding actual results (i in 2..5) write their dst */
	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		/* src0 alternates j, i within each group */
		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2);
		} else {
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		alu.last = i % 4 == 3;
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* INTERP can't swizzle dst: MOV the swizzled channels out of tmp */
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
4838
4839
4840 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
4841 {
4842 struct r600_bytecode_alu alu;
4843 int i, r;
4844
4845 for (i = 0; i < 4; i++) {
4846 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4847 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
4848 alu.op = ALU_OP0_NOP;
4849 alu.dst.chan = i;
4850 } else {
4851 alu.op = ALU_OP1_MOV;
4852 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4853 alu.src[0].sel = ctx->temp_reg;
4854 alu.src[0].chan = i;
4855 }
4856 if (i == 3) {
4857 alu.last = 1;
4858 }
4859 r = r600_bytecode_add_alu(ctx->bc, &alu);
4860 if (r)
4861 return r;
4862 }
4863 return 0;
4864 }
4865
/* Prepare one source operand of an op3 ALU instruction.
 *
 * Loads the TGSI source into *bc_src.  op3 operands cannot carry the
 * abs modifier, so a source using abs is first copied (abs applied by
 * the MOV) into channel temp_chan of the scratch register temp, and
 * the operand is rewritten to read that copy instead.
 *
 * temp must be a valid (non-zero) GPR whenever the source may use abs.
 */
static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
			unsigned temp, int temp_chan,
			struct r600_bytecode_alu_src *bc_src,
			const struct r600_shader_src *shader_src,
			unsigned chan)
{
	struct r600_bytecode_alu alu;
	int r;

	r600_bytecode_src(bc_src, shader_src, chan);

	/* op3 operands don't support abs modifier */
	if (bc_src->abs) {
		assert(temp!=0); /* we actually need the extra register, make sure it is allocated. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = temp;
		alu.dst.chan = temp_chan;
		alu.dst.write = 1;

		alu.src[0] = *bc_src;
		/* close the group after the single MOV; NOTE(review):
		 * presumed sufficient -- confirm no co-issue is required */
		alu.last = true;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* redirect the operand to the abs-free copy */
		memset(bc_src, 0, sizeof(*bc_src));
		bc_src->sel = temp;
		bc_src->chan = temp_chan;
	}
	return 0;
}
4898
4899 static int tgsi_op3(struct r600_shader_ctx *ctx)
4900 {
4901 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4902 struct r600_bytecode_alu alu;
4903 int i, j, r;
4904 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4905
4906 for (i = 0; i < lasti + 1; i++) {
4907 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4908 continue;
4909
4910 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4911 alu.op = ctx->inst_info->op;
4912 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4913 r = tgsi_make_src_for_op3(ctx, ctx->temp_reg, j, &alu.src[j], &ctx->src[j], i);
4914 if (r)
4915 return r;
4916 }
4917
4918 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4919 alu.dst.chan = i;
4920 alu.dst.write = 1;
4921 alu.is_op3 = 1;
4922 if (i == lasti) {
4923 alu.last = 1;
4924 }
4925 r = r600_bytecode_add_alu(ctx->bc, &alu);
4926 if (r)
4927 return r;
4928 }
4929 return 0;
4930 }
4931
/* Dot-product family (DP2/DP3/DP4/DPH).
 *
 * The hardware op (inst_info->op) is a four-channel reduction, so the
 * narrower variants are emulated by patching the unused channels:
 * zeros for DP2/DP3, and 1.0 in src0.w for DPH.
 */
static int tgsi_dp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
		}

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		/* handle some special cases */
		switch (inst->Instruction.Opcode) {
		case TGSI_OPCODE_DP2:
			/* only xy contribute: zero the zw products */
			if (i > 1) {
				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
				alu.src[0].chan = alu.src[1].chan = 0;
			}
			break;
		case TGSI_OPCODE_DP3:
			/* only xyz contribute: zero the w product */
			if (i > 2) {
				alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
				alu.src[0].chan = alu.src[1].chan = 0;
			}
			break;
		case TGSI_OPCODE_DPH:
			/* homogeneous dot: src0.w is treated as 1.0 */
			if (i == 3) {
				alu.src[0].sel = V_SQ_ALU_SRC_1;
				alu.src[0].chan = 0;
				alu.src[0].neg = 0;
			}
			break;
		default:
			break;
		}
		if (i == 3) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
4981
4982 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
4983 unsigned index)
4984 {
4985 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4986 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
4987 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
4988 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
4989 ctx->src[index].neg || ctx->src[index].abs ||
4990 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == TGSI_PROCESSOR_GEOMETRY);
4991 }
4992
4993 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
4994 unsigned index)
4995 {
4996 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4997 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
4998 }
4999
5000 static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
5001 {
5002 struct r600_bytecode_vtx vtx;
5003 struct r600_bytecode_alu alu;
5004 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5005 int src_gpr, r, i;
5006 int id = tgsi_tex_get_src_gpr(ctx, 1);
5007
5008 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
5009 if (src_requires_loading) {
5010 for (i = 0; i < 4; i++) {
5011 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5012 alu.op = ALU_OP1_MOV;
5013 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5014 alu.dst.sel = ctx->temp_reg;
5015 alu.dst.chan = i;
5016 if (i == 3)
5017 alu.last = 1;
5018 alu.dst.write = 1;
5019 r = r600_bytecode_add_alu(ctx->bc, &alu);
5020 if (r)
5021 return r;
5022 }
5023 src_gpr = ctx->temp_reg;
5024 }
5025
5026 memset(&vtx, 0, sizeof(vtx));
5027 vtx.op = FETCH_OP_VFETCH;
5028 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
5029 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
5030 vtx.src_gpr = src_gpr;
5031 vtx.mega_fetch_count = 16;
5032 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
5033 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
5034 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
5035 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
5036 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
5037 vtx.use_const_fields = 1;
5038
5039 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
5040 return r;
5041
5042 if (ctx->bc->chip_class >= EVERGREEN)
5043 return 0;
5044
5045 for (i = 0; i < 4; i++) {
5046 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5047 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5048 continue;
5049
5050 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5051 alu.op = ALU_OP2_AND_INT;
5052
5053 alu.dst.chan = i;
5054 alu.dst.sel = vtx.dst_gpr;
5055 alu.dst.write = 1;
5056
5057 alu.src[0].sel = vtx.dst_gpr;
5058 alu.src[0].chan = i;
5059
5060 alu.src[1].sel = 512 + (id * 2);
5061 alu.src[1].chan = i % 4;
5062 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5063
5064 if (i == lasti)
5065 alu.last = 1;
5066 r = r600_bytecode_add_alu(ctx->bc, &alu);
5067 if (r)
5068 return r;
5069 }
5070
5071 if (inst->Dst[0].Register.WriteMask & 3) {
5072 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5073 alu.op = ALU_OP2_OR_INT;
5074
5075 alu.dst.chan = 3;
5076 alu.dst.sel = vtx.dst_gpr;
5077 alu.dst.write = 1;
5078
5079 alu.src[0].sel = vtx.dst_gpr;
5080 alu.src[0].chan = 3;
5081
5082 alu.src[1].sel = 512 + (id * 2) + 1;
5083 alu.src[1].chan = 0;
5084 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5085
5086 alu.last = 1;
5087 r = r600_bytecode_add_alu(ctx->bc, &alu);
5088 if (r)
5089 return r;
5090 }
5091 return 0;
5092 }
5093
5094 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
5095 {
5096 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5097 struct r600_bytecode_alu alu;
5098 int r;
5099 int id = tgsi_tex_get_src_gpr(ctx, 1);
5100
5101 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5102 alu.op = ALU_OP1_MOV;
5103
5104 if (ctx->bc->chip_class >= EVERGREEN) {
5105 /* channel 0 or 2 of each word */
5106 alu.src[0].sel = 512 + (id / 2);
5107 alu.src[0].chan = (id % 2) * 2;
5108 } else {
5109 /* r600 we have them at channel 2 of the second dword */
5110 alu.src[0].sel = 512 + (id * 2) + 1;
5111 alu.src[0].chan = 1;
5112 }
5113 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5114 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
5115 alu.last = 1;
5116 r = r600_bytecode_add_alu(ctx->bc, &alu);
5117 if (r)
5118 return r;
5119 return 0;
5120 }
5121
5122 static int tgsi_tex(struct r600_shader_ctx *ctx)
5123 {
5124 static float one_point_five = 1.5f;
5125 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5126 struct r600_bytecode_tex tex;
5127 struct r600_bytecode_alu alu;
5128 unsigned src_gpr;
5129 int r, i, j;
5130 int opcode;
5131 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
5132 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5133 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
5134 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
5135
5136 bool txf_add_offsets = inst->Texture.NumOffsets &&
5137 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5138 inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
5139
5140 /* Texture fetch instructions can only use gprs as source.
5141 * Also they cannot negate the source or take the absolute value */
5142 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
5143 tgsi_tex_src_requires_loading(ctx, 0)) ||
5144 read_compressed_msaa || txf_add_offsets;
5145
5146 boolean src_loaded = FALSE;
5147 unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
5148 int8_t offset_x = 0, offset_y = 0, offset_z = 0;
5149 boolean has_txq_cube_array_z = false;
5150 unsigned sampler_index_mode;
5151
5152 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
5153 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5154 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
5155 if (inst->Dst[0].Register.WriteMask & 4) {
5156 ctx->shader->has_txq_cube_array_z_comp = true;
5157 has_txq_cube_array_z = true;
5158 }
5159
5160 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
5161 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5162 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
5163 inst->Instruction.Opcode == TGSI_OPCODE_TG4)
5164 sampler_src_reg = 2;
5165
5166 /* TGSI moves the sampler to src reg 3 for TXD */
5167 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
5168 sampler_src_reg = 3;
5169
5170 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
5171 if (sampler_index_mode)
5172 ctx->shader->uses_index_registers = true;
5173
5174 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
5175
5176 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
5177 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
5178 ctx->shader->uses_tex_buffers = true;
5179 return r600_do_buffer_txq(ctx);
5180 }
5181 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
5182 if (ctx->bc->chip_class < EVERGREEN)
5183 ctx->shader->uses_tex_buffers = true;
5184 return do_vtx_fetch_inst(ctx, src_requires_loading);
5185 }
5186 }
5187
5188 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
5189 int out_chan;
5190 /* Add perspective divide */
5191 if (ctx->bc->chip_class == CAYMAN) {
5192 out_chan = 2;
5193 for (i = 0; i < 3; i++) {
5194 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5195 alu.op = ALU_OP1_RECIP_IEEE;
5196 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5197
5198 alu.dst.sel = ctx->temp_reg;
5199 alu.dst.chan = i;
5200 if (i == 2)
5201 alu.last = 1;
5202 if (out_chan == i)
5203 alu.dst.write = 1;
5204 r = r600_bytecode_add_alu(ctx->bc, &alu);
5205 if (r)
5206 return r;
5207 }
5208
5209 } else {
5210 out_chan = 3;
5211 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5212 alu.op = ALU_OP1_RECIP_IEEE;
5213 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5214
5215 alu.dst.sel = ctx->temp_reg;
5216 alu.dst.chan = out_chan;
5217 alu.last = 1;
5218 alu.dst.write = 1;
5219 r = r600_bytecode_add_alu(ctx->bc, &alu);
5220 if (r)
5221 return r;
5222 }
5223
5224 for (i = 0; i < 3; i++) {
5225 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5226 alu.op = ALU_OP2_MUL;
5227 alu.src[0].sel = ctx->temp_reg;
5228 alu.src[0].chan = out_chan;
5229 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5230 alu.dst.sel = ctx->temp_reg;
5231 alu.dst.chan = i;
5232 alu.dst.write = 1;
5233 r = r600_bytecode_add_alu(ctx->bc, &alu);
5234 if (r)
5235 return r;
5236 }
5237 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5238 alu.op = ALU_OP1_MOV;
5239 alu.src[0].sel = V_SQ_ALU_SRC_1;
5240 alu.src[0].chan = 0;
5241 alu.dst.sel = ctx->temp_reg;
5242 alu.dst.chan = 3;
5243 alu.last = 1;
5244 alu.dst.write = 1;
5245 r = r600_bytecode_add_alu(ctx->bc, &alu);
5246 if (r)
5247 return r;
5248 src_loaded = TRUE;
5249 src_gpr = ctx->temp_reg;
5250 }
5251
5252
5253 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
5254 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5255 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5256 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
5257 inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
5258 inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
5259
5260 static const unsigned src0_swizzle[] = {2, 2, 0, 1};
5261 static const unsigned src1_swizzle[] = {1, 0, 2, 2};
5262
5263 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
5264 for (i = 0; i < 4; i++) {
5265 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5266 alu.op = ALU_OP2_CUBE;
5267 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
5268 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
5269 alu.dst.sel = ctx->temp_reg;
5270 alu.dst.chan = i;
5271 if (i == 3)
5272 alu.last = 1;
5273 alu.dst.write = 1;
5274 r = r600_bytecode_add_alu(ctx->bc, &alu);
5275 if (r)
5276 return r;
5277 }
5278
5279 /* tmp1.z = RCP_e(|tmp1.z|) */
5280 if (ctx->bc->chip_class == CAYMAN) {
5281 for (i = 0; i < 3; i++) {
5282 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5283 alu.op = ALU_OP1_RECIP_IEEE;
5284 alu.src[0].sel = ctx->temp_reg;
5285 alu.src[0].chan = 2;
5286 alu.src[0].abs = 1;
5287 alu.dst.sel = ctx->temp_reg;
5288 alu.dst.chan = i;
5289 if (i == 2)
5290 alu.dst.write = 1;
5291 if (i == 2)
5292 alu.last = 1;
5293 r = r600_bytecode_add_alu(ctx->bc, &alu);
5294 if (r)
5295 return r;
5296 }
5297 } else {
5298 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5299 alu.op = ALU_OP1_RECIP_IEEE;
5300 alu.src[0].sel = ctx->temp_reg;
5301 alu.src[0].chan = 2;
5302 alu.src[0].abs = 1;
5303 alu.dst.sel = ctx->temp_reg;
5304 alu.dst.chan = 2;
5305 alu.dst.write = 1;
5306 alu.last = 1;
5307 r = r600_bytecode_add_alu(ctx->bc, &alu);
5308 if (r)
5309 return r;
5310 }
5311
5312 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
5313 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
5314 * muladd has no writemask, have to use another temp
5315 */
5316 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5317 alu.op = ALU_OP3_MULADD;
5318 alu.is_op3 = 1;
5319
5320 alu.src[0].sel = ctx->temp_reg;
5321 alu.src[0].chan = 0;
5322 alu.src[1].sel = ctx->temp_reg;
5323 alu.src[1].chan = 2;
5324
5325 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5326 alu.src[2].chan = 0;
5327 alu.src[2].value = *(uint32_t *)&one_point_five;
5328
5329 alu.dst.sel = ctx->temp_reg;
5330 alu.dst.chan = 0;
5331 alu.dst.write = 1;
5332
5333 r = r600_bytecode_add_alu(ctx->bc, &alu);
5334 if (r)
5335 return r;
5336
5337 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5338 alu.op = ALU_OP3_MULADD;
5339 alu.is_op3 = 1;
5340
5341 alu.src[0].sel = ctx->temp_reg;
5342 alu.src[0].chan = 1;
5343 alu.src[1].sel = ctx->temp_reg;
5344 alu.src[1].chan = 2;
5345
5346 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5347 alu.src[2].chan = 0;
5348 alu.src[2].value = *(uint32_t *)&one_point_five;
5349
5350 alu.dst.sel = ctx->temp_reg;
5351 alu.dst.chan = 1;
5352 alu.dst.write = 1;
5353
5354 alu.last = 1;
5355 r = r600_bytecode_add_alu(ctx->bc, &alu);
5356 if (r)
5357 return r;
5358 /* write initial compare value into Z component
5359 - W src 0 for shadow cube
5360 - X src 1 for shadow cube array */
5361 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5362 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5363 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5364 alu.op = ALU_OP1_MOV;
5365 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
5366 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5367 else
5368 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5369 alu.dst.sel = ctx->temp_reg;
5370 alu.dst.chan = 2;
5371 alu.dst.write = 1;
5372 alu.last = 1;
5373 r = r600_bytecode_add_alu(ctx->bc, &alu);
5374 if (r)
5375 return r;
5376 }
5377
5378 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5379 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5380 if (ctx->bc->chip_class >= EVERGREEN) {
5381 int mytmp = r600_get_temp(ctx);
5382 static const float eight = 8.0f;
5383 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5384 alu.op = ALU_OP1_MOV;
5385 alu.src[0].sel = ctx->temp_reg;
5386 alu.src[0].chan = 3;
5387 alu.dst.sel = mytmp;
5388 alu.dst.chan = 0;
5389 alu.dst.write = 1;
5390 alu.last = 1;
5391 r = r600_bytecode_add_alu(ctx->bc, &alu);
5392 if (r)
5393 return r;
5394
5395 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */
5396 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5397 alu.op = ALU_OP3_MULADD;
5398 alu.is_op3 = 1;
5399 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5400 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5401 alu.src[1].chan = 0;
5402 alu.src[1].value = *(uint32_t *)&eight;
5403 alu.src[2].sel = mytmp;
5404 alu.src[2].chan = 0;
5405 alu.dst.sel = ctx->temp_reg;
5406 alu.dst.chan = 3;
5407 alu.dst.write = 1;
5408 alu.last = 1;
5409 r = r600_bytecode_add_alu(ctx->bc, &alu);
5410 if (r)
5411 return r;
5412 } else if (ctx->bc->chip_class < EVERGREEN) {
5413 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5414 tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
5415 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5416 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5417 tex.src_gpr = r600_get_temp(ctx);
5418 tex.src_sel_x = 0;
5419 tex.src_sel_y = 0;
5420 tex.src_sel_z = 0;
5421 tex.src_sel_w = 0;
5422 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
5423 tex.coord_type_x = 1;
5424 tex.coord_type_y = 1;
5425 tex.coord_type_z = 1;
5426 tex.coord_type_w = 1;
5427 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5428 alu.op = ALU_OP1_MOV;
5429 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5430 alu.dst.sel = tex.src_gpr;
5431 alu.dst.chan = 0;
5432 alu.last = 1;
5433 alu.dst.write = 1;
5434 r = r600_bytecode_add_alu(ctx->bc, &alu);
5435 if (r)
5436 return r;
5437
5438 r = r600_bytecode_add_tex(ctx->bc, &tex);
5439 if (r)
5440 return r;
5441 }
5442
5443 }
5444
5445 /* for cube forms of lod and bias we need to route things */
5446 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
5447 inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
5448 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5449 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
5450 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5451 alu.op = ALU_OP1_MOV;
5452 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5453 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
5454 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5455 else
5456 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5457 alu.dst.sel = ctx->temp_reg;
5458 alu.dst.chan = 2;
5459 alu.last = 1;
5460 alu.dst.write = 1;
5461 r = r600_bytecode_add_alu(ctx->bc, &alu);
5462 if (r)
5463 return r;
5464 }
5465
5466 src_loaded = TRUE;
5467 src_gpr = ctx->temp_reg;
5468 }
5469
5470 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
5471 int temp_h = 0, temp_v = 0;
5472 int start_val = 0;
5473
5474 /* if we've already loaded the src (i.e. CUBE don't reload it). */
5475 if (src_loaded == TRUE)
5476 start_val = 1;
5477 else
5478 src_loaded = TRUE;
5479 for (i = start_val; i < 3; i++) {
5480 int treg = r600_get_temp(ctx);
5481
5482 if (i == 0)
5483 src_gpr = treg;
5484 else if (i == 1)
5485 temp_h = treg;
5486 else
5487 temp_v = treg;
5488
5489 for (j = 0; j < 4; j++) {
5490 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5491 alu.op = ALU_OP1_MOV;
5492 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
5493 alu.dst.sel = treg;
5494 alu.dst.chan = j;
5495 if (j == 3)
5496 alu.last = 1;
5497 alu.dst.write = 1;
5498 r = r600_bytecode_add_alu(ctx->bc, &alu);
5499 if (r)
5500 return r;
5501 }
5502 }
5503 for (i = 1; i < 3; i++) {
5504 /* set gradients h/v */
5505 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5506 tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
5507 FETCH_OP_SET_GRADIENTS_V;
5508 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5509 tex.sampler_index_mode = sampler_index_mode;
5510 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5511 tex.resource_index_mode = sampler_index_mode;
5512
5513 tex.src_gpr = (i == 1) ? temp_h : temp_v;
5514 tex.src_sel_x = 0;
5515 tex.src_sel_y = 1;
5516 tex.src_sel_z = 2;
5517 tex.src_sel_w = 3;
5518
5519 tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
5520 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
5521 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
5522 tex.coord_type_x = 1;
5523 tex.coord_type_y = 1;
5524 tex.coord_type_z = 1;
5525 tex.coord_type_w = 1;
5526 }
5527 r = r600_bytecode_add_tex(ctx->bc, &tex);
5528 if (r)
5529 return r;
5530 }
5531 }
5532
5533 if (src_requires_loading && !src_loaded) {
5534 for (i = 0; i < 4; i++) {
5535 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5536 alu.op = ALU_OP1_MOV;
5537 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5538 alu.dst.sel = ctx->temp_reg;
5539 alu.dst.chan = i;
5540 if (i == 3)
5541 alu.last = 1;
5542 alu.dst.write = 1;
5543 r = r600_bytecode_add_alu(ctx->bc, &alu);
5544 if (r)
5545 return r;
5546 }
5547 src_loaded = TRUE;
5548 src_gpr = ctx->temp_reg;
5549 }
5550
5551 /* get offset values */
5552 if (inst->Texture.NumOffsets) {
5553 assert(inst->Texture.NumOffsets == 1);
5554
5555 /* The texture offset feature doesn't work with the TXF instruction
5556 * and must be emulated by adding the offset to the texture coordinates. */
5557 if (txf_add_offsets) {
5558 const struct tgsi_texture_offset *off = inst->TexOffsets;
5559
5560 switch (inst->Texture.Texture) {
5561 case TGSI_TEXTURE_3D:
5562 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5563 alu.op = ALU_OP2_ADD_INT;
5564 alu.src[0].sel = src_gpr;
5565 alu.src[0].chan = 2;
5566 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5567 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
5568 alu.dst.sel = src_gpr;
5569 alu.dst.chan = 2;
5570 alu.dst.write = 1;
5571 alu.last = 1;
5572 r = r600_bytecode_add_alu(ctx->bc, &alu);
5573 if (r)
5574 return r;
5575 /* fall through */
5576
5577 case TGSI_TEXTURE_2D:
5578 case TGSI_TEXTURE_SHADOW2D:
5579 case TGSI_TEXTURE_RECT:
5580 case TGSI_TEXTURE_SHADOWRECT:
5581 case TGSI_TEXTURE_2D_ARRAY:
5582 case TGSI_TEXTURE_SHADOW2D_ARRAY:
5583 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5584 alu.op = ALU_OP2_ADD_INT;
5585 alu.src[0].sel = src_gpr;
5586 alu.src[0].chan = 1;
5587 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5588 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
5589 alu.dst.sel = src_gpr;
5590 alu.dst.chan = 1;
5591 alu.dst.write = 1;
5592 alu.last = 1;
5593 r = r600_bytecode_add_alu(ctx->bc, &alu);
5594 if (r)
5595 return r;
5596 /* fall through */
5597
5598 case TGSI_TEXTURE_1D:
5599 case TGSI_TEXTURE_SHADOW1D:
5600 case TGSI_TEXTURE_1D_ARRAY:
5601 case TGSI_TEXTURE_SHADOW1D_ARRAY:
5602 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5603 alu.op = ALU_OP2_ADD_INT;
5604 alu.src[0].sel = src_gpr;
5605 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5606 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
5607 alu.dst.sel = src_gpr;
5608 alu.dst.write = 1;
5609 alu.last = 1;
5610 r = r600_bytecode_add_alu(ctx->bc, &alu);
5611 if (r)
5612 return r;
5613 break;
5614 /* texture offsets do not apply to other texture targets */
5615 }
5616 } else {
5617 switch (inst->Texture.Texture) {
5618 case TGSI_TEXTURE_3D:
5619 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
5620 /* fallthrough */
5621 case TGSI_TEXTURE_2D:
5622 case TGSI_TEXTURE_SHADOW2D:
5623 case TGSI_TEXTURE_RECT:
5624 case TGSI_TEXTURE_SHADOWRECT:
5625 case TGSI_TEXTURE_2D_ARRAY:
5626 case TGSI_TEXTURE_SHADOW2D_ARRAY:
5627 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
5628 /* fallthrough */
5629 case TGSI_TEXTURE_1D:
5630 case TGSI_TEXTURE_SHADOW1D:
5631 case TGSI_TEXTURE_1D_ARRAY:
5632 case TGSI_TEXTURE_SHADOW1D_ARRAY:
5633 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
5634 }
5635 }
5636 }
5637
5638 /* Obtain the sample index for reading a compressed MSAA color texture.
5639 * To read the FMASK, we use the ldfptr instruction, which tells us
5640 * where the samples are stored.
5641 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
5642 * which is the identity mapping. Each nibble says which physical sample
5643 * should be fetched to get that sample.
5644 *
5645 * Assume src.z contains the sample index. It should be modified like this:
5646 * src.z = (ldfptr() >> (src.z * 4)) & 0xF;
5647 * Then fetch the texel with src.
5648 */
5649 if (read_compressed_msaa) {
5650 unsigned sample_chan = 3;
5651 unsigned temp = r600_get_temp(ctx);
5652 assert(src_loaded);
5653
5654 /* temp.w = ldfptr() */
5655 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5656 tex.op = FETCH_OP_LD;
5657 tex.inst_mod = 1; /* to indicate this is ldfptr */
5658 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5659 tex.sampler_index_mode = sampler_index_mode;
5660 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5661 tex.resource_index_mode = sampler_index_mode;
5662 tex.src_gpr = src_gpr;
5663 tex.dst_gpr = temp;
5664 tex.dst_sel_x = 7; /* mask out these components */
5665 tex.dst_sel_y = 7;
5666 tex.dst_sel_z = 7;
5667 tex.dst_sel_w = 0; /* store X */
5668 tex.src_sel_x = 0;
5669 tex.src_sel_y = 1;
5670 tex.src_sel_z = 2;
5671 tex.src_sel_w = 3;
5672 tex.offset_x = offset_x;
5673 tex.offset_y = offset_y;
5674 tex.offset_z = offset_z;
5675 r = r600_bytecode_add_tex(ctx->bc, &tex);
5676 if (r)
5677 return r;
5678
5679 /* temp.x = sample_index*4 */
5680 if (ctx->bc->chip_class == CAYMAN) {
5681 for (i = 0 ; i < 4; i++) {
5682 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5683 alu.op = ALU_OP2_MULLO_INT;
5684 alu.src[0].sel = src_gpr;
5685 alu.src[0].chan = sample_chan;
5686 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5687 alu.src[1].value = 4;
5688 alu.dst.sel = temp;
5689 alu.dst.chan = i;
5690 alu.dst.write = i == 0;
5691 if (i == 3)
5692 alu.last = 1;
5693 r = r600_bytecode_add_alu(ctx->bc, &alu);
5694 if (r)
5695 return r;
5696 }
5697 } else {
5698 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5699 alu.op = ALU_OP2_MULLO_INT;
5700 alu.src[0].sel = src_gpr;
5701 alu.src[0].chan = sample_chan;
5702 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5703 alu.src[1].value = 4;
5704 alu.dst.sel = temp;
5705 alu.dst.chan = 0;
5706 alu.dst.write = 1;
5707 alu.last = 1;
5708 r = r600_bytecode_add_alu(ctx->bc, &alu);
5709 if (r)
5710 return r;
5711 }
5712
5713 /* sample_index = temp.w >> temp.x */
5714 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5715 alu.op = ALU_OP2_LSHR_INT;
5716 alu.src[0].sel = temp;
5717 alu.src[0].chan = 3;
5718 alu.src[1].sel = temp;
5719 alu.src[1].chan = 0;
5720 alu.dst.sel = src_gpr;
5721 alu.dst.chan = sample_chan;
5722 alu.dst.write = 1;
5723 alu.last = 1;
5724 r = r600_bytecode_add_alu(ctx->bc, &alu);
5725 if (r)
5726 return r;
5727
5728 /* sample_index & 0xF */
5729 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5730 alu.op = ALU_OP2_AND_INT;
5731 alu.src[0].sel = src_gpr;
5732 alu.src[0].chan = sample_chan;
5733 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5734 alu.src[1].value = 0xF;
5735 alu.dst.sel = src_gpr;
5736 alu.dst.chan = sample_chan;
5737 alu.dst.write = 1;
5738 alu.last = 1;
5739 r = r600_bytecode_add_alu(ctx->bc, &alu);
5740 if (r)
5741 return r;
5742 #if 0
5743 /* visualize the FMASK */
5744 for (i = 0; i < 4; i++) {
5745 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5746 alu.op = ALU_OP1_INT_TO_FLT;
5747 alu.src[0].sel = src_gpr;
5748 alu.src[0].chan = sample_chan;
5749 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
5750 alu.dst.chan = i;
5751 alu.dst.write = 1;
5752 alu.last = 1;
5753 r = r600_bytecode_add_alu(ctx->bc, &alu);
5754 if (r)
5755 return r;
5756 }
5757 return 0;
5758 #endif
5759 }
5760
5761 /* does this shader want a num layers from TXQ for a cube array? */
5762 if (has_txq_cube_array_z) {
5763 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5764
5765 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5766 alu.op = ALU_OP1_MOV;
5767
5768 if (ctx->bc->chip_class >= EVERGREEN) {
5769 /* channel 1 or 3 of each word */
5770 alu.src[0].sel = 512 + (id / 2);
5771 alu.src[0].chan = ((id % 2) * 2) + 1;
5772 } else {
5773 /* r600 we have them at channel 2 of the second dword */
5774 alu.src[0].sel = 512 + (id * 2) + 1;
5775 alu.src[0].chan = 2;
5776 }
5777 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5778 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
5779 alu.last = 1;
5780 r = r600_bytecode_add_alu(ctx->bc, &alu);
5781 if (r)
5782 return r;
5783 /* disable writemask from texture instruction */
5784 inst->Dst[0].Register.WriteMask &= ~4;
5785 }
5786
5787 opcode = ctx->inst_info->op;
5788 if (opcode == FETCH_OP_GATHER4 &&
5789 inst->TexOffsets[0].File != TGSI_FILE_NULL &&
5790 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
5791 opcode = FETCH_OP_GATHER4_O;
5792
5793 /* GATHER4_O/GATHER4_C_O use offset values loaded by
5794 SET_TEXTURE_OFFSETS instruction. The immediate offset values
5795 encoded in the instruction are ignored. */
5796 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5797 tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
5798 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5799 tex.sampler_index_mode = sampler_index_mode;
5800 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5801 tex.resource_index_mode = sampler_index_mode;
5802
5803 tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
5804 tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
5805 tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
5806 tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
5807 tex.src_sel_w = 4;
5808
5809 tex.dst_sel_x = 7;
5810 tex.dst_sel_y = 7;
5811 tex.dst_sel_z = 7;
5812 tex.dst_sel_w = 7;
5813
5814 r = r600_bytecode_add_tex(ctx->bc, &tex);
5815 if (r)
5816 return r;
5817 }
5818
5819 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
5820 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
5821 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
5822 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5823 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
5824 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
5825 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5826 switch (opcode) {
5827 case FETCH_OP_SAMPLE:
5828 opcode = FETCH_OP_SAMPLE_C;
5829 break;
5830 case FETCH_OP_SAMPLE_L:
5831 opcode = FETCH_OP_SAMPLE_C_L;
5832 break;
5833 case FETCH_OP_SAMPLE_LB:
5834 opcode = FETCH_OP_SAMPLE_C_LB;
5835 break;
5836 case FETCH_OP_SAMPLE_G:
5837 opcode = FETCH_OP_SAMPLE_C_G;
5838 break;
5839 /* Texture gather variants */
5840 case FETCH_OP_GATHER4:
5841 opcode = FETCH_OP_GATHER4_C;
5842 break;
5843 case FETCH_OP_GATHER4_O:
5844 opcode = FETCH_OP_GATHER4_C_O;
5845 break;
5846 }
5847 }
5848
5849 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5850 tex.op = opcode;
5851
5852 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5853 tex.sampler_index_mode = sampler_index_mode;
5854 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5855 tex.resource_index_mode = sampler_index_mode;
5856 tex.src_gpr = src_gpr;
5857 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
5858
5859 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
5860 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
5861 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
5862 }
5863
5864 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
5865 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
5866 tex.inst_mod = texture_component_select;
5867
5868 if (ctx->bc->chip_class == CAYMAN) {
5869 /* GATHER4 result order is different from TGSI TG4 */
5870 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
5871 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
5872 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
5873 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
5874 } else {
5875 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
5876 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
5877 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
5878 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
5879 }
5880 }
5881 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
5882 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
5883 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
5884 tex.dst_sel_z = 7;
5885 tex.dst_sel_w = 7;
5886 }
5887 else {
5888 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
5889 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
5890 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
5891 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
5892 }
5893
5894
5895 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) {
5896 tex.src_sel_x = 4;
5897 tex.src_sel_y = 4;
5898 tex.src_sel_z = 4;
5899 tex.src_sel_w = 4;
5900 } else if (src_loaded) {
5901 tex.src_sel_x = 0;
5902 tex.src_sel_y = 1;
5903 tex.src_sel_z = 2;
5904 tex.src_sel_w = 3;
5905 } else {
5906 tex.src_sel_x = ctx->src[0].swizzle[0];
5907 tex.src_sel_y = ctx->src[0].swizzle[1];
5908 tex.src_sel_z = ctx->src[0].swizzle[2];
5909 tex.src_sel_w = ctx->src[0].swizzle[3];
5910 tex.src_rel = ctx->src[0].rel;
5911 }
5912
5913 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
5914 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5915 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5916 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5917 tex.src_sel_x = 1;
5918 tex.src_sel_y = 0;
5919 tex.src_sel_z = 3;
5920 tex.src_sel_w = 2; /* route Z compare or Lod value into W */
5921 }
5922
5923 if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
5924 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
5925 tex.coord_type_x = 1;
5926 tex.coord_type_y = 1;
5927 }
5928 tex.coord_type_z = 1;
5929 tex.coord_type_w = 1;
5930
5931 tex.offset_x = offset_x;
5932 tex.offset_y = offset_y;
5933 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
5934 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
5935 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
5936 tex.offset_z = 0;
5937 }
5938 else {
5939 tex.offset_z = offset_z;
5940 }
5941
5942 /* Put the depth for comparison in W.
5943 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
5944 * Some instructions expect the depth in Z. */
5945 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
5946 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
5947 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
5948 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
5949 opcode != FETCH_OP_SAMPLE_C_L &&
5950 opcode != FETCH_OP_SAMPLE_C_LB) {
5951 tex.src_sel_w = tex.src_sel_z;
5952 }
5953
5954 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
5955 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
5956 if (opcode == FETCH_OP_SAMPLE_C_L ||
5957 opcode == FETCH_OP_SAMPLE_C_LB) {
5958 /* the array index is read from Y */
5959 tex.coord_type_y = 0;
5960 } else {
5961 /* the array index is read from Z */
5962 tex.coord_type_z = 0;
5963 tex.src_sel_z = tex.src_sel_y;
5964 }
5965 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
5966 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
5967 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5968 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
5969 (ctx->bc->chip_class >= EVERGREEN)))
5970 /* the array index is read from Z */
5971 tex.coord_type_z = 0;
5972
5973 /* mask unused source components */
5974 if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
5975 switch (inst->Texture.Texture) {
5976 case TGSI_TEXTURE_2D:
5977 case TGSI_TEXTURE_RECT:
5978 tex.src_sel_z = 7;
5979 tex.src_sel_w = 7;
5980 break;
5981 case TGSI_TEXTURE_1D_ARRAY:
5982 tex.src_sel_y = 7;
5983 tex.src_sel_w = 7;
5984 break;
5985 case TGSI_TEXTURE_1D:
5986 tex.src_sel_y = 7;
5987 tex.src_sel_z = 7;
5988 tex.src_sel_w = 7;
5989 break;
5990 }
5991 }
5992
5993 r = r600_bytecode_add_tex(ctx->bc, &tex);
5994 if (r)
5995 return r;
5996
5997 /* add shadow ambient support - gallium doesn't do it yet */
5998 return 0;
5999 }
6000
6001 static int tgsi_lrp(struct r600_shader_ctx *ctx)
6002 {
6003 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6004 struct r600_bytecode_alu alu;
6005 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6006 unsigned i, extra_temp;
6007 int r;
6008
6009 /* optimize if it's just an equal balance */
6010 if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
6011 for (i = 0; i < lasti + 1; i++) {
6012 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6013 continue;
6014
6015 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6016 alu.op = ALU_OP2_ADD;
6017 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
6018 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6019 alu.omod = 3;
6020 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6021 alu.dst.chan = i;
6022 if (i == lasti) {
6023 alu.last = 1;
6024 }
6025 r = r600_bytecode_add_alu(ctx->bc, &alu);
6026 if (r)
6027 return r;
6028 }
6029 return 0;
6030 }
6031
6032 /* 1 - src0 */
6033 for (i = 0; i < lasti + 1; i++) {
6034 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6035 continue;
6036
6037 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6038 alu.op = ALU_OP2_ADD;
6039 alu.src[0].sel = V_SQ_ALU_SRC_1;
6040 alu.src[0].chan = 0;
6041 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
6042 r600_bytecode_src_toggle_neg(&alu.src[1]);
6043 alu.dst.sel = ctx->temp_reg;
6044 alu.dst.chan = i;
6045 if (i == lasti) {
6046 alu.last = 1;
6047 }
6048 alu.dst.write = 1;
6049 r = r600_bytecode_add_alu(ctx->bc, &alu);
6050 if (r)
6051 return r;
6052 }
6053
6054 /* (1 - src0) * src2 */
6055 for (i = 0; i < lasti + 1; i++) {
6056 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6057 continue;
6058
6059 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6060 alu.op = ALU_OP2_MUL;
6061 alu.src[0].sel = ctx->temp_reg;
6062 alu.src[0].chan = i;
6063 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6064 alu.dst.sel = ctx->temp_reg;
6065 alu.dst.chan = i;
6066 if (i == lasti) {
6067 alu.last = 1;
6068 }
6069 alu.dst.write = 1;
6070 r = r600_bytecode_add_alu(ctx->bc, &alu);
6071 if (r)
6072 return r;
6073 }
6074
6075 /* src0 * src1 + (1 - src0) * src2 */
6076 if (ctx->src[0].abs || ctx->src[1].abs) /* XXX avoid dupliating condition */
6077 extra_temp = r600_get_temp(ctx);
6078 else
6079 extra_temp = 0;
6080 for (i = 0; i < lasti + 1; i++) {
6081 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6082 continue;
6083
6084 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6085 alu.op = ALU_OP3_MULADD;
6086 alu.is_op3 = 1;
6087 r = tgsi_make_src_for_op3(ctx, extra_temp, 0, &alu.src[0], &ctx->src[0], i);
6088 if (r)
6089 return r;
6090 r = tgsi_make_src_for_op3(ctx, extra_temp, 1, &alu.src[1], &ctx->src[1], i);
6091 if (r)
6092 return r;
6093 alu.src[2].sel = ctx->temp_reg;
6094 alu.src[2].chan = i;
6095
6096 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6097 alu.dst.chan = i;
6098 if (i == lasti) {
6099 alu.last = 1;
6100 }
6101 r = r600_bytecode_add_alu(ctx->bc, &alu);
6102 if (r)
6103 return r;
6104 }
6105 return 0;
6106 }
6107
6108 static int tgsi_cmp(struct r600_shader_ctx *ctx)
6109 {
6110 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6111 struct r600_bytecode_alu alu;
6112 int i, r;
6113 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6114
6115 for (i = 0; i < lasti + 1; i++) {
6116 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6117 continue;
6118
6119 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6120 alu.op = ALU_OP3_CNDGE;
6121 r = tgsi_make_src_for_op3(ctx, ctx->temp_reg, 0, &alu.src[0], &ctx->src[0], i);
6122 if (r)
6123 return r;
6124 r = tgsi_make_src_for_op3(ctx, ctx->temp_reg, 1, &alu.src[1], &ctx->src[2], i);
6125 if (r)
6126 return r;
6127 r = tgsi_make_src_for_op3(ctx, ctx->temp_reg, 2, &alu.src[2], &ctx->src[1], i);
6128 if (r)
6129 return r;
6130 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6131 alu.dst.chan = i;
6132 alu.dst.write = 1;
6133 alu.is_op3 = 1;
6134 if (i == lasti)
6135 alu.last = 1;
6136 r = r600_bytecode_add_alu(ctx->bc, &alu);
6137 if (r)
6138 return r;
6139 }
6140 return 0;
6141 }
6142
6143 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
6144 {
6145 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6146 struct r600_bytecode_alu alu;
6147 int i, r;
6148 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6149
6150 for (i = 0; i < lasti + 1; i++) {
6151 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6152 continue;
6153
6154 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6155 alu.op = ALU_OP3_CNDE_INT;
6156 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6157 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6158 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
6159 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6160 alu.dst.chan = i;
6161 alu.dst.write = 1;
6162 alu.is_op3 = 1;
6163 if (i == lasti)
6164 alu.last = 1;
6165 r = r600_bytecode_add_alu(ctx->bc, &alu);
6166 if (r)
6167 return r;
6168 }
6169 return 0;
6170 }
6171
6172 static int tgsi_xpd(struct r600_shader_ctx *ctx)
6173 {
6174 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6175 static const unsigned int src0_swizzle[] = {2, 0, 1};
6176 static const unsigned int src1_swizzle[] = {1, 2, 0};
6177 struct r600_bytecode_alu alu;
6178 uint32_t use_temp = 0;
6179 int i, r;
6180
6181 if (inst->Dst[0].Register.WriteMask != 0xf)
6182 use_temp = 1;
6183
6184 for (i = 0; i < 4; i++) {
6185 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6186 alu.op = ALU_OP2_MUL;
6187 if (i < 3) {
6188 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
6189 r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
6190 } else {
6191 alu.src[0].sel = V_SQ_ALU_SRC_0;
6192 alu.src[0].chan = i;
6193 alu.src[1].sel = V_SQ_ALU_SRC_0;
6194 alu.src[1].chan = i;
6195 }
6196
6197 alu.dst.sel = ctx->temp_reg;
6198 alu.dst.chan = i;
6199 alu.dst.write = 1;
6200
6201 if (i == 3)
6202 alu.last = 1;
6203 r = r600_bytecode_add_alu(ctx->bc, &alu);
6204 if (r)
6205 return r;
6206 }
6207
6208 for (i = 0; i < 4; i++) {
6209 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6210 alu.op = ALU_OP3_MULADD;
6211
6212 if (i < 3) {
6213 r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
6214 r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
6215 } else {
6216 alu.src[0].sel = V_SQ_ALU_SRC_0;
6217 alu.src[0].chan = i;
6218 alu.src[1].sel = V_SQ_ALU_SRC_0;
6219 alu.src[1].chan = i;
6220 }
6221
6222 alu.src[2].sel = ctx->temp_reg;
6223 alu.src[2].neg = 1;
6224 alu.src[2].chan = i;
6225
6226 if (use_temp)
6227 alu.dst.sel = ctx->temp_reg;
6228 else
6229 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6230 alu.dst.chan = i;
6231 alu.dst.write = 1;
6232 alu.is_op3 = 1;
6233 if (i == 3)
6234 alu.last = 1;
6235 r = r600_bytecode_add_alu(ctx->bc, &alu);
6236 if (r)
6237 return r;
6238 }
6239 if (use_temp)
6240 return tgsi_helper_copy(ctx, inst);
6241 return 0;
6242 }
6243
6244 static int tgsi_exp(struct r600_shader_ctx *ctx)
6245 {
6246 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6247 struct r600_bytecode_alu alu;
6248 int r;
6249 int i;
6250
6251 /* result.x = 2^floor(src); */
6252 if (inst->Dst[0].Register.WriteMask & 1) {
6253 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6254
6255 alu.op = ALU_OP1_FLOOR;
6256 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6257
6258 alu.dst.sel = ctx->temp_reg;
6259 alu.dst.chan = 0;
6260 alu.dst.write = 1;
6261 alu.last = 1;
6262 r = r600_bytecode_add_alu(ctx->bc, &alu);
6263 if (r)
6264 return r;
6265
6266 if (ctx->bc->chip_class == CAYMAN) {
6267 for (i = 0; i < 3; i++) {
6268 alu.op = ALU_OP1_EXP_IEEE;
6269 alu.src[0].sel = ctx->temp_reg;
6270 alu.src[0].chan = 0;
6271
6272 alu.dst.sel = ctx->temp_reg;
6273 alu.dst.chan = i;
6274 alu.dst.write = i == 0;
6275 alu.last = i == 2;
6276 r = r600_bytecode_add_alu(ctx->bc, &alu);
6277 if (r)
6278 return r;
6279 }
6280 } else {
6281 alu.op = ALU_OP1_EXP_IEEE;
6282 alu.src[0].sel = ctx->temp_reg;
6283 alu.src[0].chan = 0;
6284
6285 alu.dst.sel = ctx->temp_reg;
6286 alu.dst.chan = 0;
6287 alu.dst.write = 1;
6288 alu.last = 1;
6289 r = r600_bytecode_add_alu(ctx->bc, &alu);
6290 if (r)
6291 return r;
6292 }
6293 }
6294
6295 /* result.y = tmp - floor(tmp); */
6296 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
6297 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6298
6299 alu.op = ALU_OP1_FRACT;
6300 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6301
6302 alu.dst.sel = ctx->temp_reg;
6303 #if 0
6304 r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6305 if (r)
6306 return r;
6307 #endif
6308 alu.dst.write = 1;
6309 alu.dst.chan = 1;
6310
6311 alu.last = 1;
6312
6313 r = r600_bytecode_add_alu(ctx->bc, &alu);
6314 if (r)
6315 return r;
6316 }
6317
6318 /* result.z = RoughApprox2ToX(tmp);*/
6319 if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
6320 if (ctx->bc->chip_class == CAYMAN) {
6321 for (i = 0; i < 3; i++) {
6322 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6323 alu.op = ALU_OP1_EXP_IEEE;
6324 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6325
6326 alu.dst.sel = ctx->temp_reg;
6327 alu.dst.chan = i;
6328 if (i == 2) {
6329 alu.dst.write = 1;
6330 alu.last = 1;
6331 }
6332
6333 r = r600_bytecode_add_alu(ctx->bc, &alu);
6334 if (r)
6335 return r;
6336 }
6337 } else {
6338 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6339 alu.op = ALU_OP1_EXP_IEEE;
6340 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6341
6342 alu.dst.sel = ctx->temp_reg;
6343 alu.dst.write = 1;
6344 alu.dst.chan = 2;
6345
6346 alu.last = 1;
6347
6348 r = r600_bytecode_add_alu(ctx->bc, &alu);
6349 if (r)
6350 return r;
6351 }
6352 }
6353
6354 /* result.w = 1.0;*/
6355 if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
6356 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6357
6358 alu.op = ALU_OP1_MOV;
6359 alu.src[0].sel = V_SQ_ALU_SRC_1;
6360 alu.src[0].chan = 0;
6361
6362 alu.dst.sel = ctx->temp_reg;
6363 alu.dst.chan = 3;
6364 alu.dst.write = 1;
6365 alu.last = 1;
6366 r = r600_bytecode_add_alu(ctx->bc, &alu);
6367 if (r)
6368 return r;
6369 }
6370 return tgsi_helper_copy(ctx, inst);
6371 }
6372
6373 static int tgsi_log(struct r600_shader_ctx *ctx)
6374 {
6375 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6376 struct r600_bytecode_alu alu;
6377 int r;
6378 int i;
6379
6380 /* result.x = floor(log2(|src|)); */
6381 if (inst->Dst[0].Register.WriteMask & 1) {
6382 if (ctx->bc->chip_class == CAYMAN) {
6383 for (i = 0; i < 3; i++) {
6384 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6385
6386 alu.op = ALU_OP1_LOG_IEEE;
6387 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6388 r600_bytecode_src_set_abs(&alu.src[0]);
6389
6390 alu.dst.sel = ctx->temp_reg;
6391 alu.dst.chan = i;
6392 if (i == 0)
6393 alu.dst.write = 1;
6394 if (i == 2)
6395 alu.last = 1;
6396 r = r600_bytecode_add_alu(ctx->bc, &alu);
6397 if (r)
6398 return r;
6399 }
6400
6401 } else {
6402 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6403
6404 alu.op = ALU_OP1_LOG_IEEE;
6405 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6406 r600_bytecode_src_set_abs(&alu.src[0]);
6407
6408 alu.dst.sel = ctx->temp_reg;
6409 alu.dst.chan = 0;
6410 alu.dst.write = 1;
6411 alu.last = 1;
6412 r = r600_bytecode_add_alu(ctx->bc, &alu);
6413 if (r)
6414 return r;
6415 }
6416
6417 alu.op = ALU_OP1_FLOOR;
6418 alu.src[0].sel = ctx->temp_reg;
6419 alu.src[0].chan = 0;
6420
6421 alu.dst.sel = ctx->temp_reg;
6422 alu.dst.chan = 0;
6423 alu.dst.write = 1;
6424 alu.last = 1;
6425
6426 r = r600_bytecode_add_alu(ctx->bc, &alu);
6427 if (r)
6428 return r;
6429 }
6430
6431 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
6432 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
6433
6434 if (ctx->bc->chip_class == CAYMAN) {
6435 for (i = 0; i < 3; i++) {
6436 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6437
6438 alu.op = ALU_OP1_LOG_IEEE;
6439 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6440 r600_bytecode_src_set_abs(&alu.src[0]);
6441
6442 alu.dst.sel = ctx->temp_reg;
6443 alu.dst.chan = i;
6444 if (i == 1)
6445 alu.dst.write = 1;
6446 if (i == 2)
6447 alu.last = 1;
6448
6449 r = r600_bytecode_add_alu(ctx->bc, &alu);
6450 if (r)
6451 return r;
6452 }
6453 } else {
6454 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6455
6456 alu.op = ALU_OP1_LOG_IEEE;
6457 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6458 r600_bytecode_src_set_abs(&alu.src[0]);
6459
6460 alu.dst.sel = ctx->temp_reg;
6461 alu.dst.chan = 1;
6462 alu.dst.write = 1;
6463 alu.last = 1;
6464
6465 r = r600_bytecode_add_alu(ctx->bc, &alu);
6466 if (r)
6467 return r;
6468 }
6469
6470 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6471
6472 alu.op = ALU_OP1_FLOOR;
6473 alu.src[0].sel = ctx->temp_reg;
6474 alu.src[0].chan = 1;
6475
6476 alu.dst.sel = ctx->temp_reg;
6477 alu.dst.chan = 1;
6478 alu.dst.write = 1;
6479 alu.last = 1;
6480
6481 r = r600_bytecode_add_alu(ctx->bc, &alu);
6482 if (r)
6483 return r;
6484
6485 if (ctx->bc->chip_class == CAYMAN) {
6486 for (i = 0; i < 3; i++) {
6487 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6488 alu.op = ALU_OP1_EXP_IEEE;
6489 alu.src[0].sel = ctx->temp_reg;
6490 alu.src[0].chan = 1;
6491
6492 alu.dst.sel = ctx->temp_reg;
6493 alu.dst.chan = i;
6494 if (i == 1)
6495 alu.dst.write = 1;
6496 if (i == 2)
6497 alu.last = 1;
6498
6499 r = r600_bytecode_add_alu(ctx->bc, &alu);
6500 if (r)
6501 return r;
6502 }
6503 } else {
6504 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6505 alu.op = ALU_OP1_EXP_IEEE;
6506 alu.src[0].sel = ctx->temp_reg;
6507 alu.src[0].chan = 1;
6508
6509 alu.dst.sel = ctx->temp_reg;
6510 alu.dst.chan = 1;
6511 alu.dst.write = 1;
6512 alu.last = 1;
6513
6514 r = r600_bytecode_add_alu(ctx->bc, &alu);
6515 if (r)
6516 return r;
6517 }
6518
6519 if (ctx->bc->chip_class == CAYMAN) {
6520 for (i = 0; i < 3; i++) {
6521 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6522 alu.op = ALU_OP1_RECIP_IEEE;
6523 alu.src[0].sel = ctx->temp_reg;
6524 alu.src[0].chan = 1;
6525
6526 alu.dst.sel = ctx->temp_reg;
6527 alu.dst.chan = i;
6528 if (i == 1)
6529 alu.dst.write = 1;
6530 if (i == 2)
6531 alu.last = 1;
6532
6533 r = r600_bytecode_add_alu(ctx->bc, &alu);
6534 if (r)
6535 return r;
6536 }
6537 } else {
6538 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6539 alu.op = ALU_OP1_RECIP_IEEE;
6540 alu.src[0].sel = ctx->temp_reg;
6541 alu.src[0].chan = 1;
6542
6543 alu.dst.sel = ctx->temp_reg;
6544 alu.dst.chan = 1;
6545 alu.dst.write = 1;
6546 alu.last = 1;
6547
6548 r = r600_bytecode_add_alu(ctx->bc, &alu);
6549 if (r)
6550 return r;
6551 }
6552
6553 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6554
6555 alu.op = ALU_OP2_MUL;
6556
6557 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6558 r600_bytecode_src_set_abs(&alu.src[0]);
6559
6560 alu.src[1].sel = ctx->temp_reg;
6561 alu.src[1].chan = 1;
6562
6563 alu.dst.sel = ctx->temp_reg;
6564 alu.dst.chan = 1;
6565 alu.dst.write = 1;
6566 alu.last = 1;
6567
6568 r = r600_bytecode_add_alu(ctx->bc, &alu);
6569 if (r)
6570 return r;
6571 }
6572
6573 /* result.z = log2(|src|);*/
6574 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
6575 if (ctx->bc->chip_class == CAYMAN) {
6576 for (i = 0; i < 3; i++) {
6577 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6578
6579 alu.op = ALU_OP1_LOG_IEEE;
6580 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6581 r600_bytecode_src_set_abs(&alu.src[0]);
6582
6583 alu.dst.sel = ctx->temp_reg;
6584 if (i == 2)
6585 alu.dst.write = 1;
6586 alu.dst.chan = i;
6587 if (i == 2)
6588 alu.last = 1;
6589
6590 r = r600_bytecode_add_alu(ctx->bc, &alu);
6591 if (r)
6592 return r;
6593 }
6594 } else {
6595 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6596
6597 alu.op = ALU_OP1_LOG_IEEE;
6598 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6599 r600_bytecode_src_set_abs(&alu.src[0]);
6600
6601 alu.dst.sel = ctx->temp_reg;
6602 alu.dst.write = 1;
6603 alu.dst.chan = 2;
6604 alu.last = 1;
6605
6606 r = r600_bytecode_add_alu(ctx->bc, &alu);
6607 if (r)
6608 return r;
6609 }
6610 }
6611
6612 /* result.w = 1.0; */
6613 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
6614 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6615
6616 alu.op = ALU_OP1_MOV;
6617 alu.src[0].sel = V_SQ_ALU_SRC_1;
6618 alu.src[0].chan = 0;
6619
6620 alu.dst.sel = ctx->temp_reg;
6621 alu.dst.chan = 3;
6622 alu.dst.write = 1;
6623 alu.last = 1;
6624
6625 r = r600_bytecode_add_alu(ctx->bc, &alu);
6626 if (r)
6627 return r;
6628 }
6629
6630 return tgsi_helper_copy(ctx, inst);
6631 }
6632
6633 static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
6634 {
6635 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6636 struct r600_bytecode_alu alu;
6637 int r;
6638 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6639 unsigned reg = inst->Dst[0].Register.Index > 0 ? ctx->bc->index_reg[inst->Dst[0].Register.Index - 1] : ctx->bc->ar_reg;
6640
6641 assert(inst->Dst[0].Register.Index < 3);
6642 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6643
6644 switch (inst->Instruction.Opcode) {
6645 case TGSI_OPCODE_ARL:
6646 alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
6647 break;
6648 case TGSI_OPCODE_ARR:
6649 alu.op = ALU_OP1_FLT_TO_INT;
6650 break;
6651 case TGSI_OPCODE_UARL:
6652 alu.op = ALU_OP1_MOV;
6653 break;
6654 default:
6655 assert(0);
6656 return -1;
6657 }
6658
6659 for (i = 0; i <= lasti; ++i) {
6660 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6661 continue;
6662 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6663 alu.last = i == lasti;
6664 alu.dst.sel = reg;
6665 alu.dst.chan = i;
6666 alu.dst.write = 1;
6667 r = r600_bytecode_add_alu(ctx->bc, &alu);
6668 if (r)
6669 return r;
6670 }
6671
6672 if (inst->Dst[0].Register.Index > 0)
6673 ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
6674 else
6675 ctx->bc->ar_loaded = 0;
6676
6677 return 0;
6678 }
/* r600/r700 ARL/ARR/UARL: compute the address value into the AR backing
 * register (ctx->bc->ar_reg), one channel per written component, then
 * invalidate the cached AR so it is reloaded before the next indirect
 * access.  Returns 0 on success or a negative error code.
 * Note: the alu struct is deliberately memset once per case and reused
 * across loop iterations, with only the per-channel fields updated. */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* No single-op FLT_TO_INT_FLOOR here: FLOOR first, then
		 * convert the floored result to int in a second pass. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		/* ARR rounds instead of flooring; a single conversion. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		/* Source is already an integer: plain copy. */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* Invalidate the cached address register. */
	ctx->bc->ar_loaded = 0;
	return 0;
}
6755
6756 static int tgsi_opdst(struct r600_shader_ctx *ctx)
6757 {
6758 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6759 struct r600_bytecode_alu alu;
6760 int i, r = 0;
6761
6762 for (i = 0; i < 4; i++) {
6763 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6764
6765 alu.op = ALU_OP2_MUL;
6766 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6767
6768 if (i == 0 || i == 3) {
6769 alu.src[0].sel = V_SQ_ALU_SRC_1;
6770 } else {
6771 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6772 }
6773
6774 if (i == 0 || i == 2) {
6775 alu.src[1].sel = V_SQ_ALU_SRC_1;
6776 } else {
6777 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6778 }
6779 if (i == 3)
6780 alu.last = 1;
6781 r = r600_bytecode_add_alu(ctx->bc, &alu);
6782 if (r)
6783 return r;
6784 }
6785 return 0;
6786 }
6787
6788 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
6789 {
6790 struct r600_bytecode_alu alu;
6791 int r;
6792
6793 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6794 alu.op = opcode;
6795 alu.execute_mask = 1;
6796 alu.update_pred = 1;
6797
6798 alu.dst.sel = ctx->temp_reg;
6799 alu.dst.write = 1;
6800 alu.dst.chan = 0;
6801
6802 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6803 alu.src[1].sel = V_SQ_ALU_SRC_0;
6804 alu.src[1].chan = 0;
6805
6806 alu.last = 1;
6807
6808 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
6809 if (r)
6810 return r;
6811 return 0;
6812 }
6813
6814 static int pops(struct r600_shader_ctx *ctx, int pops)
6815 {
6816 unsigned force_pop = ctx->bc->force_add_cf;
6817
6818 if (!force_pop) {
6819 int alu_pop = 3;
6820 if (ctx->bc->cf_last) {
6821 if (ctx->bc->cf_last->op == CF_OP_ALU)
6822 alu_pop = 0;
6823 else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
6824 alu_pop = 1;
6825 }
6826 alu_pop += pops;
6827 if (alu_pop == 1) {
6828 ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
6829 ctx->bc->force_add_cf = 1;
6830 } else if (alu_pop == 2) {
6831 ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
6832 ctx->bc->force_add_cf = 1;
6833 } else {
6834 force_pop = 1;
6835 }
6836 }
6837
6838 if (force_pop) {
6839 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
6840 ctx->bc->cf_last->pop_count = pops;
6841 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
6842 }
6843
6844 return 0;
6845 }
6846
/* Recompute the worst-case hardware call-stack depth (in stack entries)
 * needed by the shader after a push of kind `reason`, and record it in
 * stack->max_entries.  Chip generations differ in how many extra stack
 * elements a push may consume, hence the per-chip adjustments below. */
static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
		unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements, entries;

	unsigned entry_size = stack->entry_size;

	/* Loop and WQM frames occupy full entries; plain pushes occupy
	 * single elements. */
	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	/* Round elements up to whole hardware entries. */
	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
}
6910
6911 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
6912 {
6913 switch(reason) {
6914 case FC_PUSH_VPM:
6915 --ctx->bc->stack.push;
6916 assert(ctx->bc->stack.push >= 0);
6917 break;
6918 case FC_PUSH_WQM:
6919 --ctx->bc->stack.push_wqm;
6920 assert(ctx->bc->stack.push_wqm >= 0);
6921 break;
6922 case FC_LOOP:
6923 --ctx->bc->stack.loop;
6924 assert(ctx->bc->stack.loop >= 0);
6925 break;
6926 default:
6927 assert(0);
6928 break;
6929 }
6930 }
6931
6932 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
6933 {
6934 switch (reason) {
6935 case FC_PUSH_VPM:
6936 ++ctx->bc->stack.push;
6937 break;
6938 case FC_PUSH_WQM:
6939 ++ctx->bc->stack.push_wqm;
6940 case FC_LOOP:
6941 ++ctx->bc->stack.loop;
6942 break;
6943 default:
6944 assert(0);
6945 }
6946
6947 callstack_update_max_depth(ctx, reason);
6948 }
6949
6950 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
6951 {
6952 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
6953
6954 sp->mid = realloc((void *)sp->mid,
6955 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
6956 sp->mid[sp->num_mid] = ctx->bc->cf_last;
6957 sp->num_mid++;
6958 }
6959
6960 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
6961 {
6962 ctx->bc->fc_sp++;
6963 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
6964 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
6965 }
6966
6967 static void fc_poplevel(struct r600_shader_ctx *ctx)
6968 {
6969 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
6970 free(sp->mid);
6971 sp->mid = NULL;
6972 sp->num_mid = 0;
6973 sp->start = NULL;
6974 sp->type = 0;
6975 ctx->bc->fc_sp--;
6976 }
6977
#if 0
/* NOTE(review): dead, never-compiled sketch of subroutine (CAL/RET)
 * lowering.  emit_return() and emit_jump_to_offset() even contain
 * stray ')' syntax errors, so this block needs fixing before it could
 * ever be enabled. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
7025
/* Open an IF block: emit the predicate compare (with a push of the
 * active mask) followed by a JUMP whose target is patched later by
 * tgsi_else()/tgsi_endif().  The CF emission order here is critical. */
static int emit_if(struct r600_shader_ctx *ctx, int opcode)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	/* Remember this IF frame so ELSE/ENDIF can patch the JUMP target. */
	fc_pushlevel(ctx, FC_IF);

	callstack_push(ctx, FC_PUSH_VPM);
	return 0;
}
7049
/* TGSI IF: conditional block taken when the float condition != 0.0. */
static int tgsi_if(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE);
}
7054
/* TGSI UIF: conditional block taken when the integer condition != 0. */
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
}
7059
7060 static int tgsi_else(struct r600_shader_ctx *ctx)
7061 {
7062 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
7063 ctx->bc->cf_last->pop_count = 1;
7064
7065 fc_set_mid(ctx, ctx->bc->fc_sp);
7066 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
7067 return 0;
7068 }
7069
7070 static int tgsi_endif(struct r600_shader_ctx *ctx)
7071 {
7072 pops(ctx, 1);
7073 if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
7074 R600_ERR("if/endif unbalanced in shader\n");
7075 return -1;
7076 }
7077
7078 if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
7079 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
7080 ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
7081 } else {
7082 ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
7083 }
7084 fc_poplevel(ctx);
7085
7086 callstack_pop(ctx, FC_PUSH_VPM);
7087 return 0;
7088 }
7089
/* TGSI BGNLOOP: open a loop frame.  Jump targets are patched later by
 * tgsi_endloop(). */
static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
{
	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
	 * limited to 4096 iterations, like the other LOOP_* instructions. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);

	fc_pushlevel(ctx, FC_LOOP);

	/* check stack depth */
	callstack_push(ctx, FC_LOOP);
	return 0;
}
7102
7103 static int tgsi_endloop(struct r600_shader_ctx *ctx)
7104 {
7105 int i;
7106
7107 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
7108
7109 if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
7110 R600_ERR("loop/endloop in shader code are not paired.\n");
7111 return -EINVAL;
7112 }
7113
7114 /* fixup loop pointers - from r600isa
7115 LOOP END points to CF after LOOP START,
7116 LOOP START point to CF after LOOP END
7117 BRK/CONT point to LOOP END CF
7118 */
7119 ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
7120
7121 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
7122
7123 for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
7124 ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
7125 }
7126 /* XXX add LOOPRET support */
7127 fc_poplevel(ctx);
7128 callstack_pop(ctx, FC_LOOP);
7129 return 0;
7130 }
7131
/* TGSI BREAKC: conditional loop break.  Finds the innermost enclosing
 * loop frame, then emits either an IF + LOOP_BREAK + ENDIF sequence
 * (workaround path) or a predicated ALU_BREAK, registering the break as
 * a mid point so tgsi_endloop() can patch its target. */
static int tgsi_loop_breakc(struct r600_shader_ctx *ctx)
{
	int r;
	unsigned int fscp;

	/* Walk down the flow-control stack to the innermost loop frame. */
	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
	{
		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
			break;
	}
	if (fscp == 0) {
		R600_ERR("BREAKC not inside loop/endloop pair\n");
		return -EINVAL;
	}

	if (ctx->bc->chip_class == EVERGREEN &&
	    ctx->bc->family != CHIP_CYPRESS &&
	    ctx->bc->family != CHIP_JUNIPER) {
		/* HW bug: ALU_BREAK does not save the active mask correctly */
		r = tgsi_uif(ctx);
		if (r)
			return r;

		r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_BREAK);
		if (r)
			return r;
		fc_set_mid(ctx, fscp);

		return tgsi_endif(ctx);
	} else {
		/* Predicated break in a single ALU_BREAK clause. */
		r = emit_logic_pred(ctx, ALU_OP2_PRED_SETE_INT, CF_OP_ALU_BREAK);
		if (r)
			return r;
		fc_set_mid(ctx, fscp);
	}

	return 0;
}
7170
7171 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
7172 {
7173 unsigned int fscp;
7174
7175 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
7176 {
7177 if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
7178 break;
7179 }
7180
7181 if (fscp == 0) {
7182 R600_ERR("Break not inside loop/endloop pair\n");
7183 return -EINVAL;
7184 }
7185
7186 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
7187
7188 fc_set_mid(ctx, fscp);
7189
7190 return 0;
7191 }
7192
7193 static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
7194 {
7195 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
7196 emit_gs_ring_writes(ctx, TRUE);
7197
7198 return r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
7199 }
7200
7201 static int tgsi_umad(struct r600_shader_ctx *ctx)
7202 {
7203 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7204 struct r600_bytecode_alu alu;
7205 int i, j, k, r;
7206 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7207
7208 /* src0 * src1 */
7209 for (i = 0; i < lasti + 1; i++) {
7210 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7211 continue;
7212
7213 if (ctx->bc->chip_class == CAYMAN) {
7214 for (j = 0 ; j < 4; j++) {
7215 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7216
7217 alu.op = ALU_OP2_MULLO_UINT;
7218 for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
7219 r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
7220 }
7221 alu.dst.chan = j;
7222 alu.dst.sel = ctx->temp_reg;
7223 alu.dst.write = (j == i);
7224 if (j == 3)
7225 alu.last = 1;
7226 r = r600_bytecode_add_alu(ctx->bc, &alu);
7227 if (r)
7228 return r;
7229 }
7230 } else {
7231 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7232
7233 alu.dst.chan = i;
7234 alu.dst.sel = ctx->temp_reg;
7235 alu.dst.write = 1;
7236
7237 alu.op = ALU_OP2_MULLO_UINT;
7238 for (j = 0; j < 2; j++) {
7239 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
7240 }
7241
7242 alu.last = 1;
7243 r = r600_bytecode_add_alu(ctx->bc, &alu);
7244 if (r)
7245 return r;
7246 }
7247 }
7248
7249
7250 for (i = 0; i < lasti + 1; i++) {
7251 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7252 continue;
7253
7254 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7255 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7256
7257 alu.op = ALU_OP2_ADD_INT;
7258
7259 alu.src[0].sel = ctx->temp_reg;
7260 alu.src[0].chan = i;
7261
7262 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
7263 if (i == lasti) {
7264 alu.last = 1;
7265 }
7266 r = r600_bytecode_add_alu(ctx->bc, &alu);
7267 if (r)
7268 return r;
7269 }
7270 return 0;
7271 }
7272
/* Per-TGSI-opcode dispatch table for r600/r700: maps each TGSI opcode
 * to the main hardware op it uses (ALU/CF/FETCH) and the emit callback
 * that lowers it.  Bare numeric indices are gaps with no symbolic TGSI
 * opcode name; unsupported opcodes route to tgsi_unsupported(). */
static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	/* XXX:
	 * For state trackers other than OpenGL, we'll want to use
	 * _RECIP_IEEE instead.
	 */
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[104]			= { ALU_OP0_NOP, tgsi_unsupported},
	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_loop_breakc},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
7477
7478 static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
7479 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
7480 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
7481 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
7482 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
7483 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
7484 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
7485 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
7486 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL, tgsi_op2},
7487 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
7488 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4, tgsi_dp},
7489 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4, tgsi_dp},
7490 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
7491 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN, tgsi_op2},
7492 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX, tgsi_op2},
7493 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
7494 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
7495 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD, tgsi_op3},
7496 [TGSI_OPCODE_SUB] = { ALU_OP2_ADD, tgsi_op2},
7497 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
7498 [TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported},
7499 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
7500 [TGSI_OPCODE_DP2A] = { ALU_OP0_NOP, tgsi_unsupported},
7501 [22] = { ALU_OP0_NOP, tgsi_unsupported},
7502 [23] = { ALU_OP0_NOP, tgsi_unsupported},
7503 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
7504 [TGSI_OPCODE_CLAMP] = { ALU_OP0_NOP, tgsi_unsupported},
7505 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
7506 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
7507 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
7508 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
7509 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, tgsi_pow},
7510 [TGSI_OPCODE_XPD] = { ALU_OP0_NOP, tgsi_xpd},
7511 [32] = { ALU_OP0_NOP, tgsi_unsupported},
7512 [TGSI_OPCODE_ABS] = { ALU_OP1_MOV, tgsi_op2},
7513 [34] = { ALU_OP0_NOP, tgsi_unsupported},
7514 [TGSI_OPCODE_DPH] = { ALU_OP2_DOT4, tgsi_dp},
7515 [TGSI_OPCODE_COS] = { ALU_OP1_COS, tgsi_trig},
7516 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
7517 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
7518 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
7519 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported},
7520 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
7521 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
7522 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
7523 [44] = { ALU_OP0_NOP, tgsi_unsupported},
7524 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
7525 [46] = { ALU_OP0_NOP, tgsi_unsupported},
7526 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
7527 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, tgsi_trig},
7528 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
7529 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
7530 [51] = { ALU_OP0_NOP, tgsi_unsupported},
7531 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
7532 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
7533 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
7534 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported},
7535 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
7536 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
7537 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
7538 [59] = { ALU_OP0_NOP, tgsi_unsupported},
7539 [60] = { ALU_OP0_NOP, tgsi_unsupported},
7540 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
7541 [62] = { ALU_OP0_NOP, tgsi_unsupported},
7542 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
7543 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
7544 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
7545 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
7546 [TGSI_OPCODE_SCS] = { ALU_OP0_NOP, tgsi_scs},
7547 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
7548 [69] = { ALU_OP0_NOP, tgsi_unsupported},
7549 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
7550 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4, tgsi_dp},
7551 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
7552 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
7553 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
7554 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
7555 [76] = { ALU_OP0_NOP, tgsi_unsupported},
7556 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
7557 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
7558 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
7559 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
7560 [TGSI_OPCODE_PUSHA] = { ALU_OP0_NOP, tgsi_unsupported},
7561 [TGSI_OPCODE_POPA] = { ALU_OP0_NOP, tgsi_unsupported},
7562 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
7563 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
7564 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
7565 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
7566 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
7567 [88] = { ALU_OP0_NOP, tgsi_unsupported},
7568 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
7569 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
7570 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
7571 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
7572 [TGSI_OPCODE_SAD] = { ALU_OP0_NOP, tgsi_unsupported},
7573 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
7574 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
7575 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
7576 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
7577 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
7578 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
7579 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
7580 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
7581 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
7582 [TGSI_OPCODE_TXQ_LZ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
7583 [104] = { ALU_OP0_NOP, tgsi_unsupported},
7584 [105] = { ALU_OP0_NOP, tgsi_unsupported},
7585 [106] = { ALU_OP0_NOP, tgsi_unsupported},
7586 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
7587 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
7588 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
7589 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
7590 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
7591 [112] = { ALU_OP0_NOP, tgsi_unsupported},
7592 [TGSI_OPCODE_CALLNZ] = { ALU_OP0_NOP, tgsi_unsupported},
7593 [114] = { ALU_OP0_NOP, tgsi_unsupported},
7594 [TGSI_OPCODE_BREAKC] = { ALU_OP0_NOP, tgsi_unsupported},
7595 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
7596 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
7597 [118] = { ALU_OP0_NOP, tgsi_unsupported},
7598 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_f2i},
7599 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
7600 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
7601 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
7602 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
7603 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
7604 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
7605 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
7606 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
7607 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
7608 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
7609 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
7610 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
7611 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
7612 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
7613 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
7614 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
7615 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
7616 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
7617 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
7618 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
7619 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
7620 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
7621 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
7622 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
7623 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
7624 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
7625 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
7626 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
7627 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
7628 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
7629 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
7630 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
7631 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
7632 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
7633 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
7634 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
7635 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
7636 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
7637 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
7638 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
7639 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
7640 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
7641 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
7642 [TGSI_OPCODE_MFENCE] = { ALU_OP0_NOP, tgsi_unsupported},
7643 [TGSI_OPCODE_LFENCE] = { ALU_OP0_NOP, tgsi_unsupported},
7644 [TGSI_OPCODE_SFENCE] = { ALU_OP0_NOP, tgsi_unsupported},
7645 [TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported},
7646 [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
7647 [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
7648 [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
7649 [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
7650 [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
7651 [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
7652 [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
7653 [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
7654 [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
7655 [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
7656 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
7657 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
7658 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
7659 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, tgsi_op2_trans},
7660 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
7661 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
7662 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
7663 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_op3},
7664 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_op3},
7665 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
7666 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
7667 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
7668 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
7669 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
7670 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
7671 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
7672 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
7673 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
7674 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
7675 };
7676
7677 static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
7678 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
7679 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
7680 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
7681 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
7682 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
7683 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
7684 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
7685 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL, tgsi_op2},
7686 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
7687 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4, tgsi_dp},
7688 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4, tgsi_dp},
7689 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
7690 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN, tgsi_op2},
7691 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX, tgsi_op2},
7692 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
7693 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
7694 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD, tgsi_op3},
7695 [TGSI_OPCODE_SUB] = { ALU_OP2_ADD, tgsi_op2},
7696 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
7697 [TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported},
7698 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
7699 [TGSI_OPCODE_DP2A] = { ALU_OP0_NOP, tgsi_unsupported},
7700 [22] = { ALU_OP0_NOP, tgsi_unsupported},
7701 [23] = { ALU_OP0_NOP, tgsi_unsupported},
7702 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
7703 [TGSI_OPCODE_CLAMP] = { ALU_OP0_NOP, tgsi_unsupported},
7704 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
7705 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
7706 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
7707 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
7708 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow},
7709 [TGSI_OPCODE_XPD] = { ALU_OP0_NOP, tgsi_xpd},
7710 [32] = { ALU_OP0_NOP, tgsi_unsupported},
7711 [TGSI_OPCODE_ABS] = { ALU_OP1_MOV, tgsi_op2},
7712 [34] = { ALU_OP0_NOP, tgsi_unsupported},
7713 [TGSI_OPCODE_DPH] = { ALU_OP2_DOT4, tgsi_dp},
7714 [TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},
7715 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
7716 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
7717 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
7718 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported},
7719 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
7720 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
7721 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
7722 [44] = { ALU_OP0_NOP, tgsi_unsupported},
7723 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
7724 [46] = { ALU_OP0_NOP, tgsi_unsupported},
7725 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
7726 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig},
7727 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
7728 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
7729 [51] = { ALU_OP0_NOP, tgsi_unsupported},
7730 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
7731 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
7732 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
7733 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported},
7734 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
7735 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
7736 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
7737 [59] = { ALU_OP0_NOP, tgsi_unsupported},
7738 [60] = { ALU_OP0_NOP, tgsi_unsupported},
7739 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
7740 [62] = { ALU_OP0_NOP, tgsi_unsupported},
7741 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
7742 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
7743 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
7744 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
7745 [TGSI_OPCODE_SCS] = { ALU_OP0_NOP, tgsi_scs},
7746 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
7747 [69] = { ALU_OP0_NOP, tgsi_unsupported},
7748 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
7749 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4, tgsi_dp},
7750 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
7751 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
7752 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
7753 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
7754 [76] = { ALU_OP0_NOP, tgsi_unsupported},
7755 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
7756 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
7757 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
7758 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
7759 [TGSI_OPCODE_PUSHA] = { ALU_OP0_NOP, tgsi_unsupported},
7760 [TGSI_OPCODE_POPA] = { ALU_OP0_NOP, tgsi_unsupported},
7761 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
7762 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2},
7763 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
7764 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
7765 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
7766 [88] = { ALU_OP0_NOP, tgsi_unsupported},
7767 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
7768 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
7769 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
7770 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
7771 [TGSI_OPCODE_SAD] = { ALU_OP0_NOP, tgsi_unsupported},
7772 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
7773 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
7774 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
7775 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
7776 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
7777 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
7778 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
7779 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
7780 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
7781 [TGSI_OPCODE_TXQ_LZ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
7782 [104] = { ALU_OP0_NOP, tgsi_unsupported},
7783 [105] = { ALU_OP0_NOP, tgsi_unsupported},
7784 [106] = { ALU_OP0_NOP, tgsi_unsupported},
7785 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
7786 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
7787 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
7788 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
7789 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
7790 [112] = { ALU_OP0_NOP, tgsi_unsupported},
7791 [TGSI_OPCODE_CALLNZ] = { ALU_OP0_NOP, tgsi_unsupported},
7792 [114] = { ALU_OP0_NOP, tgsi_unsupported},
7793 [TGSI_OPCODE_BREAKC] = { ALU_OP0_NOP, tgsi_unsupported},
7794 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
7795 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
7796 [118] = { ALU_OP0_NOP, tgsi_unsupported},
7797 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2},
7798 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
7799 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
7800 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
7801 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
7802 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
7803 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
7804 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
7805 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2},
7806 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2},
7807 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
7808 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
7809 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
7810 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
7811 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
7812 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
7813 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
7814 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
7815 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
7816 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
7817 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
7818 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
7819 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
7820 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
7821 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
7822 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
7823 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
7824 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
7825 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
7826 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
7827 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
7828 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
7829 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
7830 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
7831 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
7832 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
7833 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
7834 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
7835 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
7836 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
7837 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
7838 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
7839 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
7840 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
7841 [TGSI_OPCODE_MFENCE] = { ALU_OP0_NOP, tgsi_unsupported},
7842 [TGSI_OPCODE_LFENCE] = { ALU_OP0_NOP, tgsi_unsupported},
7843 [TGSI_OPCODE_SFENCE] = { ALU_OP0_NOP, tgsi_unsupported},
7844 [TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported},
7845 [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
7846 [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
7847 [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
7848 [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
7849 [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
7850 [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
7851 [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
7852 [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
7853 [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
7854 [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
7855 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
7856 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
7857 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
7858 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
7859 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
7860 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
7861 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
7862 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_op3},
7863 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_op3},
7864 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
7865 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
7866 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
7867 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
7868 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
7869 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
7870 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
7871 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
7872 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
7873 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
7874 };