r600: disable SB for now on tess related shaders.
[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include "r600_sq.h"
24 #include "r600_llvm.h"
25 #include "r600_formats.h"
26 #include "r600_opcodes.h"
27 #include "r600_shader.h"
28 #include "r600d.h"
29
30 #include "sb/sb_public.h"
31
32 #include "pipe/p_shader_tokens.h"
33 #include "tgsi/tgsi_info.h"
34 #include "tgsi/tgsi_parse.h"
35 #include "tgsi/tgsi_scan.h"
36 #include "tgsi/tgsi_dump.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include <stdio.h>
40 #include <errno.h>
41
42 /* CAYMAN notes
43 Why CAYMAN got loops for lots of instructions is explained here.
44
45 -These 8xx t-slot only ops are implemented in all vector slots.
46 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47 These 8xx t-slot only opcodes become vector ops, with all four
48 slots expecting the arguments on sources a and b. Result is
49 broadcast to all channels.
50 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51 These 8xx t-slot only opcodes become vector ops in the z, y, and
52 x slots.
53 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55 SQRT_IEEE/_64
56 SIN/COS
57 The w slot may have an independent co-issued operation, or if the
58 result is required to be in the w slot, the opcode above may be
59 issued in the w slot as well.
60 The compiler must issue the source argument to slots z, y, and x
61 */
62
63 #define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
64 static int r600_shader_from_tgsi(struct r600_context *rctx,
65 struct r600_pipe_shader *pipeshader,
66 union r600_shader_key key);
67
68
69 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
70 int size, unsigned comp_mask) {
71
72 if (!size)
73 return;
74
75 if (ps->num_arrays == ps->max_arrays) {
76 ps->max_arrays += 64;
77 ps->arrays = realloc(ps->arrays, ps->max_arrays *
78 sizeof(struct r600_shader_array));
79 }
80
81 int n = ps->num_arrays;
82 ++ps->num_arrays;
83
84 ps->arrays[n].comp_mask = comp_mask;
85 ps->arrays[n].gpr_start = start_gpr;
86 ps->arrays[n].gpr_count = size;
87 }
88
89 static void r600_dump_streamout(struct pipe_stream_output_info *so)
90 {
91 unsigned i;
92
93 fprintf(stderr, "STREAMOUT\n");
94 for (i = 0; i < so->num_outputs; i++) {
95 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
96 so->output[i].start_component;
97 fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
98 i,
99 so->output[i].stream,
100 so->output[i].output_buffer,
101 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
102 so->output[i].register_index,
103 mask & 1 ? "x" : "",
104 mask & 2 ? "y" : "",
105 mask & 4 ? "z" : "",
106 mask & 8 ? "w" : "",
107 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
108 }
109 }
110
111 static int store_shader(struct pipe_context *ctx,
112 struct r600_pipe_shader *shader)
113 {
114 struct r600_context *rctx = (struct r600_context *)ctx;
115 uint32_t *ptr, i;
116
117 if (shader->bo == NULL) {
118 shader->bo = (struct r600_resource*)
119 pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
120 if (shader->bo == NULL) {
121 return -ENOMEM;
122 }
123 ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
124 if (R600_BIG_ENDIAN) {
125 for (i = 0; i < shader->shader.bc.ndw; ++i) {
126 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
127 }
128 } else {
129 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
130 }
131 rctx->b.ws->buffer_unmap(shader->bo->cs_buf);
132 }
133
134 return 0;
135 }
136
/*
 * Compile the selector's TGSI into r600 bytecode, optionally run the SB
 * optimizing backend on it, upload the result into a buffer object and
 * build the processor-specific hardware state.
 *
 * ctx    - pipe context (really a r600_context)
 * shader - destination; shader->selector supplies the TGSI tokens
 * key    - state-dependent compile options (as_es/as_ls variants, etc.)
 *
 * Returns 0 on success or a negative errno; on any failure the partially
 * built shader is torn down via r600_pipe_shader_destroy() before return.
 */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
	/* SB is the optimizing backend; it also doubles as disassembler */
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}
	/* disable SB for now on tess-related shaders: LS vertex shaders and
	 * both tessellation stages bypass the optimizer */
	if (shader->shader.processor_type == TGSI_PROCESSOR_VERTEX) {
		/* only disable for vertex shaders in tess paths */
		if (key.vs.as_ls)
			use_sb = 0;
	}
	use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_TESS_CTRL);
	use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_TESS_EVAL);

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	/* Check if the bytecode has already been built. When using the llvm
	 * backend, r600_shader_from_tgsi() will take care of building the
	 * bytecode.
	 */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	if (dump && !sb_disasm) {
		/* plain disassembly, no SB involved */
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		/* SB pass: optimizes when use_sb is set and/or prints its own
		 * disassembly when dump is set */
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
		                             dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case TGSI_PROCESSOR_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case TGSI_PROCESSOR_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case TGSI_PROCESSOR_GEOMETRY:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case TGSI_PROCESSOR_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case TGSI_PROCESSOR_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
270
/* Release everything a compiled shader owns: the GPU buffer holding the
 * uploaded bytecode, the CPU-side bytecode and the command buffer.
 * ctx is unused here but kept for API symmetry with the create path. */
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
{
	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
277
278 /*
279 * tgsi -> r600 shader
280 */
281 struct r600_shader_tgsi_instruction;
282
/* A TGSI source operand decoded into r600 register-file terms. */
struct r600_shader_src {
	unsigned			sel;		/* register selector (may be a V_SQ_ALU_SRC_* special, e.g. LITERAL) */
	unsigned			swizzle[4];	/* per-channel swizzle */
	unsigned			neg;		/* negate modifier */
	unsigned			abs;		/* absolute-value modifier */
	unsigned			rel;		/* relative (indirect) addressing */
	unsigned			kc_bank;	/* constant-cache bank */
	boolean				kc_rel;		/* true if cache bank is indexed */
	uint32_t			value[4];	/* literal words when sel == V_SQ_ALU_SRC_LITERAL */
};
293
/* Per-interpolator bookkeeping for evergreen: whether this barycentric
 * set is used by the shader and which i/j slot was assigned to it. */
struct eg_interp {
	boolean				enabled;
	unsigned			ij_index;
};
298
/* Transient state used while translating one TGSI shader to r600 bytecode. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;		/* TGSI_PROCESSOR_* of the shader being built */
	unsigned				file_offset[TGSI_FILE_COUNT];	/* first GPR of each TGSI register file */
	unsigned				temp_reg;	/* base of the driver-reserved scratch GPRs */
	const struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];		/* decoded sources of the current instruction */
	uint32_t				*literals;	/* immediate pool gathered from TGSI */
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used;	/* scratch temps handed out by r600_get_temp() */
	boolean					use_llvm;
	/* needed for evergreen interpolation */
	struct eg_interp			eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;	/* -1 if not allocated */
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;	/* -1 if not allocated */
	int					colors_used;
	boolean					clip_vertex_write;
	unsigned				cv_output;	/* output slot of CLIPVERTEX */
	unsigned				edgeflag_output;	/* output slot of EDGEFLAG */
	int					fragcoord_input;	/* input slot of POSITION in a fs */
	int					native_integers;
	int					next_ring_offset;	/* running gs input ring offset */
	int					gs_out_ring_offset;	/* running gs output ring offset */
	int					gs_next_vertex;
	struct r600_shader			*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
};
334
/* Table entry mapping one TGSI opcode to its r600 op and emit callback. */
struct r600_shader_tgsi_instruction {
	unsigned	op;					/* r600 opcode to emit */
	int (*process)(struct r600_shader_ctx *ctx);		/* translator for this opcode */
};
339
340 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
341 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
342 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
343 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
344 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
345 static int tgsi_else(struct r600_shader_ctx *ctx);
346 static int tgsi_endif(struct r600_shader_ctx *ctx);
347 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
348 static int tgsi_endloop(struct r600_shader_ctx *ctx);
349 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
350 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
351 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
352 unsigned int dst_reg);
353 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
354 const struct r600_shader_src *shader_src,
355 unsigned chan);
356
357 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
358 {
359 struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
360 int j;
361
362 if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
363 R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
364 return -EINVAL;
365 }
366 if (i->Instruction.Predicate) {
367 R600_ERR("predicate unsupported\n");
368 return -EINVAL;
369 }
370 #if 0
371 if (i->Instruction.Label) {
372 R600_ERR("label unsupported\n");
373 return -EINVAL;
374 }
375 #endif
376 for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
377 if (i->Src[j].Register.Dimension) {
378 switch (i->Src[j].Register.File) {
379 case TGSI_FILE_CONSTANT:
380 break;
381 case TGSI_FILE_INPUT:
382 if (ctx->type == TGSI_PROCESSOR_GEOMETRY)
383 break;
384 default:
385 R600_ERR("unsupported src %d (dimension %d)\n", j,
386 i->Src[j].Register.Dimension);
387 return -EINVAL;
388 }
389 }
390 }
391 for (j = 0; j < i->Instruction.NumDstRegs; j++) {
392 if (i->Dst[j].Register.Dimension) {
393 R600_ERR("unsupported dst (dimension)\n");
394 return -EINVAL;
395 }
396 }
397 return 0;
398 }
399
400 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
401 {
402 if (interpolate == TGSI_INTERPOLATE_COLOR ||
403 interpolate == TGSI_INTERPOLATE_LINEAR ||
404 interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
405 {
406 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
407 int loc;
408
409 switch(location) {
410 case TGSI_INTERPOLATE_LOC_CENTER:
411 loc = 1;
412 break;
413 case TGSI_INTERPOLATE_LOC_CENTROID:
414 loc = 2;
415 break;
416 case TGSI_INTERPOLATE_LOC_SAMPLE:
417 default:
418 loc = 0; break;
419 }
420
421 return is_linear * 3 + loc;
422 }
423
424 return -1;
425 }
426
427 static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
428 int input)
429 {
430 int i = eg_get_interpolator_index(
431 ctx->shader->input[input].interpolate,
432 ctx->shader->input[input].interpolate_location);
433 assert(i >= 0);
434 ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
435 }
436
/*
 * Emit the INTERP_ZW/INTERP_XY ALU sequence that interpolates one fragment
 * shader input from its barycentric i/j pair and the parameter at lds_pos.
 * Two i/j pairs live per GPR (pair 0 in chans 0/1, pair 1 in chans 2/3 -
 * see the gpr/base_chan derivation below).
 * Returns 0, or the error from r600_bytecode_add_alu().
 */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	/* Eight ops in two groups of four: the INTERP_ZW group writes dst
	 * channels 2/3 (i = 2,3), the INTERP_XY group channels 0/1 (i = 4,5);
	 * the remaining slots are issued without a write. */
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* alternate between the two channels holding the i/j pair */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;	/* close each 4-slot instruction group */
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
477
478 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
479 {
480 int i, r;
481 struct r600_bytecode_alu alu;
482
483 for (i = 0; i < 4; i++) {
484 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
485
486 alu.op = ALU_OP1_INTERP_LOAD_P0;
487
488 alu.dst.sel = ctx->shader->input[input].gpr;
489 alu.dst.write = 1;
490
491 alu.dst.chan = i;
492
493 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
494 alu.src[0].chan = i;
495
496 if (i == 3)
497 alu.last = 1;
498 r = r600_bytecode_add_alu(ctx->bc, &alu);
499 if (r)
500 return r;
501 }
502 return 0;
503 }
504
505 /*
506 * Special export handling in shaders
507 *
508 * shader export ARRAY_BASE for EXPORT_POS:
509 * 60 is position
510 * 61 is misc vector
511 * 62, 63 are clip distance vectors
512 *
513 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
514 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
515 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
516 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
517 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
518 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
519 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
520 * exclusive from render target index)
521 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
522 *
523 *
524 * shader export ARRAY_BASE for EXPORT_PIXEL:
525 * 0-7 CB targets
526 * 61 computed Z vector
527 *
528 * The use of the values exported in the computed Z vector are controlled
529 * by DB_SHADER_CONTROL:
530 * Z_EXPORT_ENABLE - Z as a float in RED
531 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
532 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
533 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
534 * DB_SOURCE_FORMAT - export control restrictions
535 *
536 */
537
538
539 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
540 static int r600_spi_sid(struct r600_shader_io * io)
541 {
542 int index, name = io->name;
543
544 /* These params are handled differently, they don't need
545 * semantic indices, so we'll use 0 for them.
546 */
547 if (name == TGSI_SEMANTIC_POSITION ||
548 name == TGSI_SEMANTIC_PSIZE ||
549 name == TGSI_SEMANTIC_EDGEFLAG ||
550 name == TGSI_SEMANTIC_FACE ||
551 name == TGSI_SEMANTIC_SAMPLEMASK)
552 index = 0;
553 else {
554 if (name == TGSI_SEMANTIC_GENERIC) {
555 /* For generic params simply use sid from tgsi */
556 index = io->sid;
557 } else {
558 /* For non-generic params - pack name and sid into 8 bits */
559 index = 0x80 | (name<<3) | (io->sid);
560 }
561
562 /* Make sure that all really used indices have nonzero value, so
563 * we can just compare it to 0 later instead of comparing the name
564 * with different values to detect special cases. */
565 index++;
566 }
567
568 return index;
569 };
570
571 /* turn input into interpolate on EG */
572 static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
573 {
574 int r = 0;
575
576 if (ctx->shader->input[index].spi_sid) {
577 ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
578 if (ctx->shader->input[index].interpolate > 0) {
579 evergreen_interp_assign_ij_index(ctx, index);
580 if (!ctx->use_llvm)
581 r = evergreen_interp_alu(ctx, index);
582 } else {
583 if (!ctx->use_llvm)
584 r = evergreen_interp_flat(ctx, index);
585 }
586 }
587 return r;
588 }
589
590 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
591 {
592 struct r600_bytecode_alu alu;
593 int i, r;
594 int gpr_front = ctx->shader->input[front].gpr;
595 int gpr_back = ctx->shader->input[back].gpr;
596
597 for (i = 0; i < 4; i++) {
598 memset(&alu, 0, sizeof(alu));
599 alu.op = ALU_OP3_CNDGT;
600 alu.is_op3 = 1;
601 alu.dst.write = 1;
602 alu.dst.sel = gpr_front;
603 alu.src[0].sel = ctx->face_gpr;
604 alu.src[1].sel = gpr_front;
605 alu.src[2].sel = gpr_back;
606
607 alu.dst.chan = i;
608 alu.src[1].chan = i;
609 alu.src[2].chan = i;
610 alu.last = (i==3);
611
612 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
613 return r;
614 }
615
616 return 0;
617 }
618
619 static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
620 {
621 return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
622 }
623
624 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
625 {
626 int i;
627 i = ctx->shader->noutput++;
628 ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
629 ctx->shader->output[i].sid = 0;
630 ctx->shader->output[i].gpr = 0;
631 ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
632 ctx->shader->output[i].write_mask = 0x4;
633 ctx->shader->output[i].spi_sid = prim_id_sid;
634
635 return 0;
636 }
637
/*
 * Process one TGSI declaration token: record inputs and outputs in the
 * r600_shader tables (assigning GPRs / SPI semantic ids / ring offsets),
 * register indirectly-addressed temp arrays, and emit setup code for a
 * few system values. Returns 0, or -EINVAL for unsupported declarations.
 */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < Elements(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				if (ctx->bc->chip_class >= EVERGREEN) {
					/* eg+ loads fs inputs with explicit interp instructions */
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < Elements(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == TGSI_PROCESSOR_VERTEX ||
			    ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					/* 4 bits of component mask per clipdist vector */
					ctx->shader->clip_dist_write |= d->Declaration.UsageMask <<
									((d->Semantic.Index + j) << 2);
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		/* only indirectly addressed temp ranges need tracking as arrays */
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				r600_add_gpr_array(ctx->shader,
						   ctx->file_offset[TGSI_FILE_TEMPORARY] +
						   d->Range.First,
						   d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			if (!ctx->native_integers) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				/* instance id arrives as an int in GPR0.w;
				 * convert to float in place for non-native-int GL */
				alu.op = ALU_OP1_INT_TO_FLT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		/* fallthrough - any other system value is unsupported */
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
790
791 static int r600_get_temp(struct r600_shader_ctx *ctx)
792 {
793 return ctx->temp_reg + ctx->max_driver_temp_used++;
794 }
795
/*
 * Scan the TGSI for system-value declarations (sample mask, sample id,
 * sample pos) and interpolateAt* opcodes, then allocate input GPRs
 * starting at gpr_offset for the system values that are needed.
 * Also flags the interpolators interpolateAt* will require in
 * ctx->eg_interpolators, and fills ctx->face_gpr /
 * ctx->fixed_pt_position_gpr with the allocated registers.
 * Returns the first free GPR after the allocated ones.
 */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;		/* ctx field receiving the allocated gpr */
		unsigned name, alternate_name;
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int i, k, num_regs = 0;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					inputs[1].enabled = true; /* needs SAMPLEID */
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < Elements(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* allocate one GPR per enabled system-value input */
	for (i = 0; i < Elements(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput ++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
875
876 /*
877 * for evergreen we need to scan the shader to find the number of GPRs we need to
878 * reserve for interpolation and system values
879 *
880 * we need to know if we are going to emit
881 * any sample or centroid inputs
882 * if perspective and linear are required
883 */
884 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
885 {
886 int i;
887 int num_baryc;
888 struct tgsi_parse_context parse;
889
890 memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
891
892 for (i = 0; i < ctx->info.num_inputs; i++) {
893 int k;
894 /* skip position/face/mask/sampleid */
895 if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
896 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
897 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
898 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
899 continue;
900
901 k = eg_get_interpolator_index(
902 ctx->info.input_interpolate[i],
903 ctx->info.input_interpolate_loc[i]);
904 if (k >= 0)
905 ctx->eg_interpolators[k].enabled = TRUE;
906 }
907
908 if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
909 return 0;
910 }
911
912 /* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
913 while (!tgsi_parse_end_of_tokens(&parse)) {
914 tgsi_parse_token(&parse);
915
916 if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
917 const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
918 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
919 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
920 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
921 {
922 int interpolate, location, k;
923
924 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
925 location = TGSI_INTERPOLATE_LOC_CENTER;
926 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
927 location = TGSI_INTERPOLATE_LOC_CENTER;
928 } else {
929 location = TGSI_INTERPOLATE_LOC_CENTROID;
930 }
931
932 interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
933 k = eg_get_interpolator_index(interpolate, location);
934 ctx->eg_interpolators[k].enabled = true;
935 }
936 }
937 }
938
939 tgsi_parse_free(&parse);
940
941 /* assign gpr to each interpolator according to priority */
942 num_baryc = 0;
943 for (i = 0; i < Elements(ctx->eg_interpolators); i++) {
944 if (ctx->eg_interpolators[i].enabled) {
945 ctx->eg_interpolators[i].ij_index = num_baryc;
946 num_baryc ++;
947 }
948 }
949
950 /* XXX PULL MODEL and LINE STIPPLE */
951
952 num_baryc = (num_baryc + 1) >> 1;
953 return allocate_system_value_inputs(ctx, num_baryc);
954 }
955
/* Fetch a sample position from the R600_BUFFER_INFO const buffer into a
 * fresh temp GPR and return that GPR (or a negative error).
 * sample_id_sel == NULL means fetch for current sample: the sample id is
 * then taken from the .w channel of the fixed-point position register;
 * otherwise chan_sel selects the sample id from *sample_id. */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	assert(ctx->fixed_pt_position_gpr != -1);

	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		struct r600_bytecode_alu alu;

		/* copy the requested sample id into t1.x to use as fetch index */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	/* fetch a full xyzw float vector into t1 */
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 1; // first element is size of buffer
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}
1010
1011 static void tgsi_src(struct r600_shader_ctx *ctx,
1012 const struct tgsi_full_src_register *tgsi_src,
1013 struct r600_shader_src *r600_src)
1014 {
1015 memset(r600_src, 0, sizeof(*r600_src));
1016 r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
1017 r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
1018 r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
1019 r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
1020 r600_src->neg = tgsi_src->Register.Negate;
1021 r600_src->abs = tgsi_src->Register.Absolute;
1022
1023 if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
1024 int index;
1025 if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
1026 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
1027 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
1028
1029 index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
1030 r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
1031 if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
1032 return;
1033 }
1034 index = tgsi_src->Register.Index;
1035 r600_src->sel = V_SQ_ALU_SRC_LITERAL;
1036 memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
1037 } else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
1038 if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
1039 r600_src->swizzle[0] = 2; // Z value
1040 r600_src->swizzle[1] = 2;
1041 r600_src->swizzle[2] = 2;
1042 r600_src->swizzle[3] = 2;
1043 r600_src->sel = ctx->face_gpr;
1044 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
1045 r600_src->swizzle[0] = 3; // W value
1046 r600_src->swizzle[1] = 3;
1047 r600_src->swizzle[2] = 3;
1048 r600_src->swizzle[3] = 3;
1049 r600_src->sel = ctx->fixed_pt_position_gpr;
1050 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
1051 r600_src->swizzle[0] = 0;
1052 r600_src->swizzle[1] = 1;
1053 r600_src->swizzle[2] = 4;
1054 r600_src->swizzle[3] = 4;
1055 r600_src->sel = load_sample_position(ctx, NULL, -1);
1056 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
1057 r600_src->swizzle[0] = 3;
1058 r600_src->swizzle[1] = 3;
1059 r600_src->swizzle[2] = 3;
1060 r600_src->swizzle[3] = 3;
1061 r600_src->sel = 0;
1062 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
1063 r600_src->swizzle[0] = 0;
1064 r600_src->swizzle[1] = 0;
1065 r600_src->swizzle[2] = 0;
1066 r600_src->swizzle[3] = 0;
1067 r600_src->sel = 0;
1068 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1069 r600_src->swizzle[0] = 3;
1070 r600_src->swizzle[1] = 3;
1071 r600_src->swizzle[2] = 3;
1072 r600_src->swizzle[3] = 3;
1073 r600_src->sel = 1;
1074 }
1075 } else {
1076 if (tgsi_src->Register.Indirect)
1077 r600_src->rel = V_SQ_REL_RELATIVE;
1078 r600_src->sel = tgsi_src->Register.Index;
1079 r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
1080 }
1081 if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
1082 if (tgsi_src->Register.Dimension) {
1083 r600_src->kc_bank = tgsi_src->Dimension.Index;
1084 if (tgsi_src->Dimension.Indirect) {
1085 r600_src->kc_rel = 1;
1086 }
1087 }
1088 }
1089 }
1090
/* Fetch a relatively-addressed constant through the vertex cache.
 *
 * cb_idx   constant buffer id (kcache bank) to read from
 * cb_rel   buffer index mode for indirectly selected constant buffers
 * offset   constant added to the address-register value
 * ar_chan  channel of the AR register holding the index
 * dst_reg  GPR that receives the four fetched dwords
 */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
				unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
				unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		/* index = AR + offset, computed into dst_reg.ar_chan (dst_reg
		 * is safe to use as scratch: the fetch overwrites it anyway) */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	/* fetch dst_reg.xyzw = cb[index] through the vertex cache */
	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0; /* SEL_X */
	vtx.dst_sel_y = 1; /* SEL_Y */
	vtx.dst_sel_z = 2; /* SEL_Z */
	vtx.dst_sel_w = 3; /* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2; /* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1; /* FORMAT_COMP_SIGNED */
	vtx.endian = r600_endian_swap(32);
	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1146
1147 static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1148 {
1149 struct r600_bytecode_vtx vtx;
1150 int r;
1151 unsigned index = src->Register.Index;
1152 unsigned vtx_id = src->Dimension.Index;
1153 int offset_reg = vtx_id / 3;
1154 int offset_chan = vtx_id % 3;
1155
1156 /* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
1157 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
1158
1159 if (offset_reg == 0 && offset_chan == 2)
1160 offset_chan = 3;
1161
1162 if (src->Dimension.Indirect) {
1163 int treg[3];
1164 int t2;
1165 struct r600_bytecode_alu alu;
1166 int r, i;
1167
1168 /* you have got to be shitting me -
1169 we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
1170 at least this is what fglrx seems to do. */
1171 for (i = 0; i < 3; i++) {
1172 treg[i] = r600_get_temp(ctx);
1173 }
1174 r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
1175
1176 t2 = r600_get_temp(ctx);
1177 for (i = 0; i < 3; i++) {
1178 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1179 alu.op = ALU_OP1_MOV;
1180 alu.src[0].sel = 0;
1181 alu.src[0].chan = i == 2 ? 3 : i;
1182 alu.dst.sel = treg[i];
1183 alu.dst.chan = 0;
1184 alu.dst.write = 1;
1185 alu.last = 1;
1186 r = r600_bytecode_add_alu(ctx->bc, &alu);
1187 if (r)
1188 return r;
1189 }
1190 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1191 alu.op = ALU_OP1_MOV;
1192 alu.src[0].sel = treg[0];
1193 alu.src[0].rel = 1;
1194 alu.dst.sel = t2;
1195 alu.dst.write = 1;
1196 alu.last = 1;
1197 r = r600_bytecode_add_alu(ctx->bc, &alu);
1198 if (r)
1199 return r;
1200 offset_reg = t2;
1201 }
1202
1203
1204 memset(&vtx, 0, sizeof(vtx));
1205 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1206 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1207 vtx.src_gpr = offset_reg;
1208 vtx.src_sel_x = offset_chan;
1209 vtx.offset = index * 16; /*bytes*/
1210 vtx.mega_fetch_count = 16;
1211 vtx.dst_gpr = dst_reg;
1212 vtx.dst_sel_x = 0; /* SEL_X */
1213 vtx.dst_sel_y = 1; /* SEL_Y */
1214 vtx.dst_sel_z = 2; /* SEL_Z */
1215 vtx.dst_sel_w = 3; /* SEL_W */
1216 if (ctx->bc->chip_class >= EVERGREEN) {
1217 vtx.use_const_fields = 1;
1218 } else {
1219 vtx.data_format = FMT_32_32_32_32_FLOAT;
1220 }
1221
1222 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1223 return r;
1224
1225 return 0;
1226 }
1227
1228 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1229 {
1230 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1231 int i;
1232
1233 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1234 struct tgsi_full_src_register *src = &inst->Src[i];
1235
1236 if (src->Register.File == TGSI_FILE_INPUT) {
1237 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1238 /* primitive id is in R0.z */
1239 ctx->src[i].sel = 0;
1240 ctx->src[i].swizzle[0] = 2;
1241 }
1242 }
1243 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1244 int treg = r600_get_temp(ctx);
1245
1246 fetch_gs_input(ctx, src, treg);
1247 ctx->src[i].sel = treg;
1248 }
1249 }
1250 return 0;
1251 }
1252
1253 static int tgsi_split_constant(struct r600_shader_ctx *ctx)
1254 {
1255 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1256 struct r600_bytecode_alu alu;
1257 int i, j, k, nconst, r;
1258
1259 for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
1260 if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
1261 nconst++;
1262 }
1263 tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
1264 }
1265 for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
1266 if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
1267 continue;
1268 }
1269
1270 if (ctx->src[i].rel) {
1271 int chan = inst->Src[i].Indirect.Swizzle;
1272 int treg = r600_get_temp(ctx);
1273 if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
1274 return r;
1275
1276 ctx->src[i].kc_bank = 0;
1277 ctx->src[i].kc_rel = 0;
1278 ctx->src[i].sel = treg;
1279 ctx->src[i].rel = 0;
1280 j--;
1281 } else if (j > 0) {
1282 int treg = r600_get_temp(ctx);
1283 for (k = 0; k < 4; k++) {
1284 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1285 alu.op = ALU_OP1_MOV;
1286 alu.src[0].sel = ctx->src[i].sel;
1287 alu.src[0].chan = k;
1288 alu.src[0].rel = ctx->src[i].rel;
1289 alu.src[0].kc_bank = ctx->src[i].kc_bank;
1290 alu.src[0].kc_rel = ctx->src[i].kc_rel;
1291 alu.dst.sel = treg;
1292 alu.dst.chan = k;
1293 alu.dst.write = 1;
1294 if (k == 3)
1295 alu.last = 1;
1296 r = r600_bytecode_add_alu(ctx->bc, &alu);
1297 if (r)
1298 return r;
1299 }
1300 ctx->src[i].sel = treg;
1301 ctx->src[i].rel =0;
1302 j--;
1303 }
1304 }
1305 return 0;
1306 }
1307
1308 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1309 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1310 {
1311 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1312 struct r600_bytecode_alu alu;
1313 int i, j, k, nliteral, r;
1314
1315 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1316 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1317 nliteral++;
1318 }
1319 }
1320 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1321 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1322 int treg = r600_get_temp(ctx);
1323 for (k = 0; k < 4; k++) {
1324 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1325 alu.op = ALU_OP1_MOV;
1326 alu.src[0].sel = ctx->src[i].sel;
1327 alu.src[0].chan = k;
1328 alu.src[0].value = ctx->src[i].value[k];
1329 alu.dst.sel = treg;
1330 alu.dst.chan = k;
1331 alu.dst.write = 1;
1332 if (k == 3)
1333 alu.last = 1;
1334 r = r600_bytecode_add_alu(ctx->bc, &alu);
1335 if (r)
1336 return r;
1337 }
1338 ctx->src[i].sel = treg;
1339 j--;
1340 }
1341 }
1342 return 0;
1343 }
1344
1345 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1346 {
1347 int i, r, count = ctx->shader->ninput;
1348
1349 for (i = 0; i < count; i++) {
1350 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1351 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
1352 if (r)
1353 return r;
1354 }
1355 }
1356 return 0;
1357 }
1358
1359 static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
1360 int stream, unsigned *stream_item_size)
1361 {
1362 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
1363 unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
1364 int i, j, r;
1365
1366 /* Sanity checking. */
1367 if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
1368 R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
1369 r = -EINVAL;
1370 goto out_err;
1371 }
1372 for (i = 0; i < so->num_outputs; i++) {
1373 if (so->output[i].output_buffer >= 4) {
1374 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
1375 so->output[i].output_buffer);
1376 r = -EINVAL;
1377 goto out_err;
1378 }
1379 }
1380
1381 /* Initialize locations where the outputs are stored. */
1382 for (i = 0; i < so->num_outputs; i++) {
1383
1384 so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
1385 start_comp[i] = so->output[i].start_component;
1386 /* Lower outputs with dst_offset < start_component.
1387 *
1388 * We can only output 4D vectors with a write mask, e.g. we can
1389 * only output the W component at offset 3, etc. If we want
1390 * to store Y, Z, or W at buffer offset 0, we need to use MOV
1391 * to move it to X and output X. */
1392 if (so->output[i].dst_offset < so->output[i].start_component) {
1393 unsigned tmp = r600_get_temp(ctx);
1394
1395 for (j = 0; j < so->output[i].num_components; j++) {
1396 struct r600_bytecode_alu alu;
1397 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1398 alu.op = ALU_OP1_MOV;
1399 alu.src[0].sel = so_gpr[i];
1400 alu.src[0].chan = so->output[i].start_component + j;
1401
1402 alu.dst.sel = tmp;
1403 alu.dst.chan = j;
1404 alu.dst.write = 1;
1405 if (j == so->output[i].num_components - 1)
1406 alu.last = 1;
1407 r = r600_bytecode_add_alu(ctx->bc, &alu);
1408 if (r)
1409 return r;
1410 }
1411 start_comp[i] = 0;
1412 so_gpr[i] = tmp;
1413 }
1414 }
1415
1416 /* Write outputs to buffers. */
1417 for (i = 0; i < so->num_outputs; i++) {
1418 struct r600_bytecode_output output;
1419
1420 if (stream != -1 && stream != so->output[i].output_buffer)
1421 continue;
1422
1423 memset(&output, 0, sizeof(struct r600_bytecode_output));
1424 output.gpr = so_gpr[i];
1425 output.elem_size = so->output[i].num_components - 1;
1426 if (output.elem_size == 2)
1427 output.elem_size = 3; // 3 not supported, write 4 with junk at end
1428 output.array_base = so->output[i].dst_offset - start_comp[i];
1429 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1430 output.burst_count = 1;
1431 /* array_size is an upper limit for the burst_count
1432 * with MEM_STREAM instructions */
1433 output.array_size = 0xFFF;
1434 output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
1435
1436 if (ctx->bc->chip_class >= EVERGREEN) {
1437 switch (so->output[i].output_buffer) {
1438 case 0:
1439 output.op = CF_OP_MEM_STREAM0_BUF0;
1440 break;
1441 case 1:
1442 output.op = CF_OP_MEM_STREAM0_BUF1;
1443 break;
1444 case 2:
1445 output.op = CF_OP_MEM_STREAM0_BUF2;
1446 break;
1447 case 3:
1448 output.op = CF_OP_MEM_STREAM0_BUF3;
1449 break;
1450 }
1451 output.op += so->output[i].stream * 4;
1452 assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
1453 ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
1454 } else {
1455 switch (so->output[i].output_buffer) {
1456 case 0:
1457 output.op = CF_OP_MEM_STREAM0;
1458 break;
1459 case 1:
1460 output.op = CF_OP_MEM_STREAM1;
1461 break;
1462 case 2:
1463 output.op = CF_OP_MEM_STREAM2;
1464 break;
1465 case 3:
1466 output.op = CF_OP_MEM_STREAM3;
1467 break;
1468 }
1469 ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
1470 }
1471 r = r600_bytecode_add_output(ctx->bc, &output);
1472 if (r)
1473 goto out_err;
1474 }
1475 return 0;
1476 out_err:
1477 return r;
1478 }
1479
1480 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
1481 {
1482 struct r600_bytecode_alu alu;
1483 unsigned reg;
1484
1485 if (!ctx->shader->vs_out_edgeflag)
1486 return;
1487
1488 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
1489
1490 /* clamp(x, 0, 1) */
1491 memset(&alu, 0, sizeof(alu));
1492 alu.op = ALU_OP1_MOV;
1493 alu.src[0].sel = reg;
1494 alu.dst.sel = reg;
1495 alu.dst.write = 1;
1496 alu.dst.clamp = 1;
1497 alu.last = 1;
1498 r600_bytecode_add_alu(ctx->bc, &alu);
1499
1500 memset(&alu, 0, sizeof(alu));
1501 alu.op = ALU_OP1_FLT_TO_INT;
1502 alu.src[0].sel = reg;
1503 alu.dst.sel = reg;
1504 alu.dst.write = 1;
1505 alu.last = 1;
1506 r600_bytecode_add_alu(ctx->bc, &alu);
1507 }
1508
/* Build the "GS copy shader": a VS-typed shader run after geometry
 * shading that fetches the GS outputs for one emitted vertex from the
 * GSVS ring, performs streamout for each vertex stream, and does the
 * position/parameter exports normally done by the vertex shader.  The
 * result is stored in gs->gs_copy_shader.
 *
 * Returns the result of r600_bytecode_build() (0 on success).
 */
static int generate_gs_copy_shader(struct r600_context *rctx,
				   struct r600_pipe_shader *gs,
				   struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	int ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int i, j, next_clip_pos = 61, next_param = 0;
	int ring;

	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0; /* NOTE(review): allocation failure still returns 0 (success) */

	/* the copy shader's outputs are exactly the GS outputs */
	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	/* the copy shader runs as a vertex shader */
	ctx.type = ctx.bc->type = TGSI_PROCESSOR_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* R0.x = R0.x & 0x3fffffff -- mask off the stream id bits,
	 * leaving the ring offset */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30 -- the stream id of this vertex */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		/* output i lands in GPR i+1 (GPR 0 holds the system values) */
		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	ctx.temp_reg = i + 1;
	/* One predicated section per vertex stream: compare the stream id in
	 * R0.y against the ring number and do streamout inside the branch. */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				break;
			}
		}
		/* ring 0 is always emitted (it also carries the exports below) */
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		/* jump past this section if the predicate failed; the target
		 * address is patched in on the next iteration (or at the end) */
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, ring, &cshader->shader.ring_item_sizes[ring]);
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->chip_class == R600) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* skip outputs that are only written to non-zero streams */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* if nothing exported a position, emit a dummy POS export */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	/* likewise, emit a dummy PARAM export if there was none */
	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	/* mark the final export of each kind as the last one */
	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	/* close the last predicated stream section */
	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	/* terminate the program */
	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	/* one stack slot for the predicate push above */
	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}
1814
1815 static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
1816 {
1817 if (ind) {
1818 struct r600_bytecode_alu alu;
1819 int r;
1820
1821 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1822 alu.op = ALU_OP2_ADD_INT;
1823 alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
1824 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1825 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
1826 alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
1827 alu.dst.write = 1;
1828 alu.last = 1;
1829 r = r600_bytecode_add_alu(ctx->bc, &alu);
1830 if (r)
1831 return r;
1832 }
1833 return 0;
1834 }
1835
/* Emit the MEM_RING exports that write this shader's outputs to the
 * ESGS/GSVS ring and advance gs_next_vertex.
 *
 * stream  vertex stream to write (-1 selects the stream-0 opcode and
 *         index register)
 * ind     use indirect (register-indexed) ring addressing; otherwise the
 *         write offset is computed statically from gs_next_vertex
 */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int i, k, ring_offset;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			/* output not consumed by the GS: skip it */
			if (ring_offset == -1)
				continue;
		} else {
			/* GS itself: outputs are packed densely, 16 bytes each */
			ring_offset = idx * 16;
			idx++;
		}

		/* POSITION is only written for stream 0 */
		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		/* one MEM_RING opcode per stream */
		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		/* NOTE(review): the r600_bytecode_add_output() return value is
		 * ignored here */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	++ctx->gs_next_vertex;
	return 0;
}
1905
1906 static int r600_shader_from_tgsi(struct r600_context *rctx,
1907 struct r600_pipe_shader *pipeshader,
1908 union r600_shader_key key)
1909 {
1910 struct r600_screen *rscreen = rctx->screen;
1911 struct r600_shader *shader = &pipeshader->shader;
1912 struct tgsi_token *tokens = pipeshader->selector->tokens;
1913 struct pipe_stream_output_info so = pipeshader->selector->so;
1914 struct tgsi_full_immediate *immediate;
1915 struct r600_shader_ctx ctx;
1916 struct r600_bytecode_output output[32];
1917 unsigned output_done, noutput;
1918 unsigned opcode;
1919 int i, j, k, r = 0;
1920 int next_param_base = 0, next_clip_base;
1921 int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
1922 /* Declarations used by llvm code */
1923 bool use_llvm = false;
1924 bool indirect_gprs;
1925 bool ring_outputs = false;
1926 bool pos_emitted = false;
1927
1928 #ifdef R600_USE_LLVM
1929 use_llvm = rscreen->b.debug_flags & DBG_LLVM;
1930 #endif
1931 ctx.bc = &shader->bc;
1932 ctx.shader = shader;
1933 ctx.native_integers = true;
1934
1935
1936 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
1937 rscreen->has_compressed_msaa_texturing);
1938 ctx.tokens = tokens;
1939 tgsi_scan_shader(tokens, &ctx.info);
1940 shader->indirect_files = ctx.info.indirect_files;
1941
1942 shader->uses_doubles = ctx.info.uses_doubles;
1943
1944 indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
1945 tgsi_parse_init(&ctx.parse, tokens);
1946 ctx.type = ctx.info.processor;
1947 shader->processor_type = ctx.type;
1948 ctx.bc->type = shader->processor_type;
1949
1950 switch (ctx.type) {
1951 case TGSI_PROCESSOR_VERTEX:
1952 shader->vs_as_gs_a = key.vs.as_gs_a;
1953 shader->vs_as_es = key.vs.as_es;
1954 shader->vs_as_ls = key.vs.as_ls;
1955 if (shader->vs_as_es)
1956 ring_outputs = true;
1957 break;
1958 case TGSI_PROCESSOR_GEOMETRY:
1959 ring_outputs = true;
1960 break;
1961 case TGSI_PROCESSOR_TESS_CTRL:
1962 shader->tcs_prim_mode = key.tcs.prim_mode;
1963 break;
1964 case TGSI_PROCESSOR_TESS_EVAL:
1965 shader->tes_as_es = key.tes.as_es;
1966 if (shader->tes_as_es)
1967 ring_outputs = true;
1968 break;
1969 case TGSI_PROCESSOR_FRAGMENT:
1970 shader->two_side = key.ps.color_two_side;
1971 break;
1972 default:
1973 break;
1974 }
1975
1976 if (shader->vs_as_es || shader->tes_as_es) {
1977 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
1978 } else {
1979 ctx.gs_for_vs = NULL;
1980 }
1981
1982 ctx.next_ring_offset = 0;
1983 ctx.gs_out_ring_offset = 0;
1984 ctx.gs_next_vertex = 0;
1985 ctx.gs_stream_output_info = &so;
1986
1987 ctx.face_gpr = -1;
1988 ctx.fixed_pt_position_gpr = -1;
1989 ctx.fragcoord_input = -1;
1990 ctx.colors_used = 0;
1991 ctx.clip_vertex_write = 0;
1992
1993 shader->nr_ps_color_exports = 0;
1994 shader->nr_ps_max_color_exports = 0;
1995
1996
1997 /* register allocations */
1998 /* Values [0,127] correspond to GPR[0..127].
1999 * Values [128,159] correspond to constant buffer bank 0
2000 * Values [160,191] correspond to constant buffer bank 1
2001 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
2002 * Values [256,287] correspond to constant buffer bank 2 (EG)
2003 * Values [288,319] correspond to constant buffer bank 3 (EG)
2004 * Other special values are shown in the list below.
2005 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
2006 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
2007 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
2008 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
2009 * 248 SQ_ALU_SRC_0: special constant 0.0.
2010 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
2011 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
2012 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
2013 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
2014 * 253 SQ_ALU_SRC_LITERAL: literal constant.
2015 * 254 SQ_ALU_SRC_PV: previous vector result.
2016 * 255 SQ_ALU_SRC_PS: previous scalar result.
2017 */
2018 for (i = 0; i < TGSI_FILE_COUNT; i++) {
2019 ctx.file_offset[i] = 0;
2020 }
2021
2022 #ifdef R600_USE_LLVM
2023 if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
2024 fprintf(stderr, "Warning: R600 LLVM backend does not support "
2025 "indirect adressing. Falling back to TGSI "
2026 "backend.\n");
2027 use_llvm = 0;
2028 }
2029 #endif
2030 if (ctx.type == TGSI_PROCESSOR_VERTEX) {
2031 ctx.file_offset[TGSI_FILE_INPUT] = 1;
2032 if (!use_llvm) {
2033 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
2034 }
2035 }
2036 if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
2037 if (ctx.bc->chip_class >= EVERGREEN)
2038 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
2039 else
2040 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
2041 }
2042 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2043 /* FIXME 1 would be enough in some cases (3 or less input vertices) */
2044 ctx.file_offset[TGSI_FILE_INPUT] = 2;
2045 }
2046 ctx.use_llvm = use_llvm;
2047
2048 if (use_llvm) {
2049 ctx.file_offset[TGSI_FILE_OUTPUT] =
2050 ctx.file_offset[TGSI_FILE_INPUT];
2051 } else {
2052 ctx.file_offset[TGSI_FILE_OUTPUT] =
2053 ctx.file_offset[TGSI_FILE_INPUT] +
2054 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
2055 }
2056 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
2057 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
2058
2059 /* Outside the GPR range. This will be translated to one of the
2060 * kcache banks later. */
2061 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
2062
2063 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
2064 ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
2065 ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
2066 ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
2067 ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
2068
2069 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2070 ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
2071 ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
2072 ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
2073 ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
2074 ctx.temp_reg = ctx.bc->ar_reg + 7;
2075 } else {
2076 ctx.temp_reg = ctx.bc->ar_reg + 3;
2077 }
2078
2079 shader->max_arrays = 0;
2080 shader->num_arrays = 0;
2081 if (indirect_gprs) {
2082
2083 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
2084 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
2085 ctx.file_offset[TGSI_FILE_OUTPUT] -
2086 ctx.file_offset[TGSI_FILE_INPUT],
2087 0x0F);
2088 }
2089 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
2090 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
2091 ctx.file_offset[TGSI_FILE_TEMPORARY] -
2092 ctx.file_offset[TGSI_FILE_OUTPUT],
2093 0x0F);
2094 }
2095 }
2096
2097 ctx.nliterals = 0;
2098 ctx.literals = NULL;
2099
2100 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
2101 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
2102 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
2103
2104 if (shader->vs_as_gs_a)
2105 vs_add_primid_output(&ctx, key.vs.prim_id_out);
2106
2107 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
2108 tgsi_parse_token(&ctx.parse);
2109 switch (ctx.parse.FullToken.Token.Type) {
2110 case TGSI_TOKEN_TYPE_IMMEDIATE:
2111 immediate = &ctx.parse.FullToken.FullImmediate;
2112 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
2113 if(ctx.literals == NULL) {
2114 r = -ENOMEM;
2115 goto out_err;
2116 }
2117 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
2118 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
2119 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
2120 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
2121 ctx.nliterals++;
2122 break;
2123 case TGSI_TOKEN_TYPE_DECLARATION:
2124 r = tgsi_declaration(&ctx);
2125 if (r)
2126 goto out_err;
2127 break;
2128 case TGSI_TOKEN_TYPE_INSTRUCTION:
2129 case TGSI_TOKEN_TYPE_PROPERTY:
2130 break;
2131 default:
2132 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
2133 r = -EINVAL;
2134 goto out_err;
2135 }
2136 }
2137
2138 shader->ring_item_sizes[0] = ctx.next_ring_offset;
2139 shader->ring_item_sizes[1] = 0;
2140 shader->ring_item_sizes[2] = 0;
2141 shader->ring_item_sizes[3] = 0;
2142
2143 /* Process two side if needed */
2144 if (shader->two_side && ctx.colors_used) {
2145 int i, count = ctx.shader->ninput;
2146 unsigned next_lds_loc = ctx.shader->nlds;
2147
2148 /* additional inputs will be allocated right after the existing inputs,
2149 * we won't need them after the color selection, so we don't need to
2150 * reserve these gprs for the rest of the shader code and to adjust
2151 * output offsets etc. */
2152 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
2153 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
2154
2155 /* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
2156 if (ctx.face_gpr == -1) {
2157 i = ctx.shader->ninput++;
2158 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
2159 ctx.shader->input[i].spi_sid = 0;
2160 ctx.shader->input[i].gpr = gpr++;
2161 ctx.face_gpr = ctx.shader->input[i].gpr;
2162 }
2163
2164 for (i = 0; i < count; i++) {
2165 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2166 int ni = ctx.shader->ninput++;
2167 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
2168 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
2169 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
2170 ctx.shader->input[ni].gpr = gpr++;
2171 // TGSI to LLVM needs to know the lds position of inputs.
2172 // Non LLVM path computes it later (in process_twoside_color)
2173 ctx.shader->input[ni].lds_pos = next_lds_loc++;
2174 ctx.shader->input[i].back_color_input = ni;
2175 if (ctx.bc->chip_class >= EVERGREEN) {
2176 if ((r = evergreen_interp_input(&ctx, ni)))
2177 return r;
2178 }
2179 }
2180 }
2181 }
2182
2183 /* LLVM backend setup */
2184 #ifdef R600_USE_LLVM
2185 if (use_llvm) {
2186 struct radeon_llvm_context radeon_llvm_ctx;
2187 LLVMModuleRef mod;
2188 bool dump = r600_can_dump_shader(&rscreen->b, tokens);
2189 boolean use_kill = false;
2190
2191 memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
2192 radeon_llvm_ctx.type = ctx.type;
2193 radeon_llvm_ctx.two_side = shader->two_side;
2194 radeon_llvm_ctx.face_gpr = ctx.face_gpr;
2195 radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1;
2196 radeon_llvm_ctx.r600_inputs = ctx.shader->input;
2197 radeon_llvm_ctx.r600_outputs = ctx.shader->output;
2198 radeon_llvm_ctx.color_buffer_count = max_color_exports;
2199 radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
2200 radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
2201 radeon_llvm_ctx.stream_outputs = &so;
2202 radeon_llvm_ctx.alpha_to_one = key.ps.alpha_to_one;
2203 radeon_llvm_ctx.has_compressed_msaa_texturing =
2204 ctx.bc->has_compressed_msaa_texturing;
2205 mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
2206 ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
2207 ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
2208
2209 if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
2210 radeon_llvm_dispose(&radeon_llvm_ctx);
2211 use_llvm = 0;
2212 fprintf(stderr, "R600 LLVM backend failed to compile "
2213 "shader. Falling back to TGSI\n");
2214 } else {
2215 ctx.file_offset[TGSI_FILE_OUTPUT] =
2216 ctx.file_offset[TGSI_FILE_INPUT];
2217 }
2218 if (use_kill)
2219 ctx.shader->uses_kill = use_kill;
2220 radeon_llvm_dispose(&radeon_llvm_ctx);
2221 }
2222 #endif
2223 /* End of LLVM backend setup */
2224
2225 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
2226 shader->nr_ps_max_color_exports = 8;
2227
2228 if (!use_llvm) {
2229 if (ctx.fragcoord_input >= 0) {
2230 if (ctx.bc->chip_class == CAYMAN) {
2231 for (j = 0 ; j < 4; j++) {
2232 struct r600_bytecode_alu alu;
2233 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2234 alu.op = ALU_OP1_RECIP_IEEE;
2235 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
2236 alu.src[0].chan = 3;
2237
2238 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
2239 alu.dst.chan = j;
2240 alu.dst.write = (j == 3);
2241 alu.last = 1;
2242 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
2243 return r;
2244 }
2245 } else {
2246 struct r600_bytecode_alu alu;
2247 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2248 alu.op = ALU_OP1_RECIP_IEEE;
2249 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
2250 alu.src[0].chan = 3;
2251
2252 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
2253 alu.dst.chan = 3;
2254 alu.dst.write = 1;
2255 alu.last = 1;
2256 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
2257 return r;
2258 }
2259 }
2260
2261 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2262 struct r600_bytecode_alu alu;
2263 int r;
2264
2265 /* GS thread with no output workaround - emit a cut at start of GS */
2266 if (ctx.bc->chip_class == R600)
2267 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
2268
2269 for (j = 0; j < 4; j++) {
2270 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2271 alu.op = ALU_OP1_MOV;
2272 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
2273 alu.src[0].value = 0;
2274 alu.dst.sel = ctx.gs_export_gpr_tregs[j];
2275 alu.dst.write = 1;
2276 alu.last = 1;
2277 r = r600_bytecode_add_alu(ctx.bc, &alu);
2278 if (r)
2279 return r;
2280 }
2281 }
2282 if (shader->two_side && ctx.colors_used) {
2283 if ((r = process_twoside_color_inputs(&ctx)))
2284 return r;
2285 }
2286
2287 tgsi_parse_init(&ctx.parse, tokens);
2288 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
2289 tgsi_parse_token(&ctx.parse);
2290 switch (ctx.parse.FullToken.Token.Type) {
2291 case TGSI_TOKEN_TYPE_INSTRUCTION:
2292 r = tgsi_is_supported(&ctx);
2293 if (r)
2294 goto out_err;
2295 ctx.max_driver_temp_used = 0;
2296 /* reserve first tmp for everyone */
2297 r600_get_temp(&ctx);
2298
2299 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
2300 if ((r = tgsi_split_constant(&ctx)))
2301 goto out_err;
2302 if ((r = tgsi_split_literal_constant(&ctx)))
2303 goto out_err;
2304 if (ctx.type == TGSI_PROCESSOR_GEOMETRY)
2305 if ((r = tgsi_split_gs_inputs(&ctx)))
2306 goto out_err;
2307 if (ctx.bc->chip_class == CAYMAN)
2308 ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
2309 else if (ctx.bc->chip_class >= EVERGREEN)
2310 ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
2311 else
2312 ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
2313 r = ctx.inst_info->process(&ctx);
2314 if (r)
2315 goto out_err;
2316 break;
2317 default:
2318 break;
2319 }
2320 }
2321 }
2322
2323 /* Reset the temporary register counter. */
2324 ctx.max_driver_temp_used = 0;
2325
2326 noutput = shader->noutput;
2327
2328 if (!ring_outputs && ctx.clip_vertex_write) {
2329 unsigned clipdist_temp[2];
2330
2331 clipdist_temp[0] = r600_get_temp(&ctx);
2332 clipdist_temp[1] = r600_get_temp(&ctx);
2333
2334 /* need to convert a clipvertex write into clipdistance writes and not export
2335 the clip vertex anymore */
2336
2337 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
2338 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
2339 shader->output[noutput].gpr = clipdist_temp[0];
2340 noutput++;
2341 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
2342 shader->output[noutput].gpr = clipdist_temp[1];
2343 noutput++;
2344
2345 /* reset spi_sid for clipvertex output to avoid confusing spi */
2346 shader->output[ctx.cv_output].spi_sid = 0;
2347
2348 shader->clip_dist_write = 0xFF;
2349
2350 for (i = 0; i < 8; i++) {
2351 int oreg = i >> 2;
2352 int ochan = i & 3;
2353
2354 for (j = 0; j < 4; j++) {
2355 struct r600_bytecode_alu alu;
2356 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2357 alu.op = ALU_OP2_DOT4;
2358 alu.src[0].sel = shader->output[ctx.cv_output].gpr;
2359 alu.src[0].chan = j;
2360
2361 alu.src[1].sel = 512 + i;
2362 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
2363 alu.src[1].chan = j;
2364
2365 alu.dst.sel = clipdist_temp[oreg];
2366 alu.dst.chan = j;
2367 alu.dst.write = (j == ochan);
2368 if (j == 3)
2369 alu.last = 1;
2370 if (!use_llvm)
2371 r = r600_bytecode_add_alu(ctx.bc, &alu);
2372 if (r)
2373 return r;
2374 }
2375 }
2376 }
2377
2378 /* Add stream outputs. */
2379 if (!ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX &&
2380 so.num_outputs && !use_llvm)
2381 emit_streamout(&ctx, &so, -1, NULL);
2382
2383 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
2384 convert_edgeflag_to_int(&ctx);
2385
2386 if (ring_outputs) {
2387 if (shader->vs_as_es || shader->tes_as_es) {
2388 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
2389 ctx.gs_export_gpr_tregs[1] = -1;
2390 ctx.gs_export_gpr_tregs[2] = -1;
2391 ctx.gs_export_gpr_tregs[3] = -1;
2392
2393 emit_gs_ring_writes(&ctx, &so, -1, FALSE);
2394 }
2395 } else {
2396 /* Export output */
2397 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
2398
2399 for (i = 0, j = 0; i < noutput; i++, j++) {
2400 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2401 output[j].gpr = shader->output[i].gpr;
2402 output[j].elem_size = 3;
2403 output[j].swizzle_x = 0;
2404 output[j].swizzle_y = 1;
2405 output[j].swizzle_z = 2;
2406 output[j].swizzle_w = 3;
2407 output[j].burst_count = 1;
2408 output[j].type = -1;
2409 output[j].op = CF_OP_EXPORT;
2410 switch (ctx.type) {
2411 case TGSI_PROCESSOR_VERTEX:
2412 switch (shader->output[i].name) {
2413 case TGSI_SEMANTIC_POSITION:
2414 output[j].array_base = 60;
2415 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2416 pos_emitted = true;
2417 break;
2418
2419 case TGSI_SEMANTIC_PSIZE:
2420 output[j].array_base = 61;
2421 output[j].swizzle_y = 7;
2422 output[j].swizzle_z = 7;
2423 output[j].swizzle_w = 7;
2424 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2425 pos_emitted = true;
2426 break;
2427 case TGSI_SEMANTIC_EDGEFLAG:
2428 output[j].array_base = 61;
2429 output[j].swizzle_x = 7;
2430 output[j].swizzle_y = 0;
2431 output[j].swizzle_z = 7;
2432 output[j].swizzle_w = 7;
2433 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2434 pos_emitted = true;
2435 break;
2436 case TGSI_SEMANTIC_LAYER:
2437 /* spi_sid is 0 for outputs that are
2438 * not consumed by PS */
2439 if (shader->output[i].spi_sid) {
2440 output[j].array_base = next_param_base++;
2441 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2442 j++;
2443 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2444 }
2445 output[j].array_base = 61;
2446 output[j].swizzle_x = 7;
2447 output[j].swizzle_y = 7;
2448 output[j].swizzle_z = 0;
2449 output[j].swizzle_w = 7;
2450 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2451 pos_emitted = true;
2452 break;
2453 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2454 /* spi_sid is 0 for outputs that are
2455 * not consumed by PS */
2456 if (shader->output[i].spi_sid) {
2457 output[j].array_base = next_param_base++;
2458 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2459 j++;
2460 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2461 }
2462 output[j].array_base = 61;
2463 output[j].swizzle_x = 7;
2464 output[j].swizzle_y = 7;
2465 output[j].swizzle_z = 7;
2466 output[j].swizzle_w = 0;
2467 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2468 pos_emitted = true;
2469 break;
2470 case TGSI_SEMANTIC_CLIPVERTEX:
2471 j--;
2472 break;
2473 case TGSI_SEMANTIC_CLIPDIST:
2474 output[j].array_base = next_clip_base++;
2475 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2476 pos_emitted = true;
2477 /* spi_sid is 0 for clipdistance outputs that were generated
2478 * for clipvertex - we don't need to pass them to PS */
2479 if (shader->output[i].spi_sid) {
2480 j++;
2481 /* duplicate it as PARAM to pass to the pixel shader */
2482 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2483 output[j].array_base = next_param_base++;
2484 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2485 }
2486 break;
2487 case TGSI_SEMANTIC_FOG:
2488 output[j].swizzle_y = 4; /* 0 */
2489 output[j].swizzle_z = 4; /* 0 */
2490 output[j].swizzle_w = 5; /* 1 */
2491 break;
2492 case TGSI_SEMANTIC_PRIMID:
2493 output[j].swizzle_x = 2;
2494 output[j].swizzle_y = 4; /* 0 */
2495 output[j].swizzle_z = 4; /* 0 */
2496 output[j].swizzle_w = 4; /* 0 */
2497 break;
2498 }
2499
2500 break;
2501 case TGSI_PROCESSOR_FRAGMENT:
2502 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
2503 /* never export more colors than the number of CBs */
2504 if (shader->output[i].sid >= max_color_exports) {
2505 /* skip export */
2506 j--;
2507 continue;
2508 }
2509 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
2510 output[j].array_base = shader->output[i].sid;
2511 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2512 shader->nr_ps_color_exports++;
2513 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
2514 for (k = 1; k < max_color_exports; k++) {
2515 j++;
2516 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2517 output[j].gpr = shader->output[i].gpr;
2518 output[j].elem_size = 3;
2519 output[j].swizzle_x = 0;
2520 output[j].swizzle_y = 1;
2521 output[j].swizzle_z = 2;
2522 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
2523 output[j].burst_count = 1;
2524 output[j].array_base = k;
2525 output[j].op = CF_OP_EXPORT;
2526 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2527 shader->nr_ps_color_exports++;
2528 }
2529 }
2530 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
2531 output[j].array_base = 61;
2532 output[j].swizzle_x = 2;
2533 output[j].swizzle_y = 7;
2534 output[j].swizzle_z = output[j].swizzle_w = 7;
2535 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2536 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
2537 output[j].array_base = 61;
2538 output[j].swizzle_x = 7;
2539 output[j].swizzle_y = 1;
2540 output[j].swizzle_z = output[j].swizzle_w = 7;
2541 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2542 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
2543 output[j].array_base = 61;
2544 output[j].swizzle_x = 7;
2545 output[j].swizzle_y = 7;
2546 output[j].swizzle_z = 0;
2547 output[j].swizzle_w = 7;
2548 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2549 } else {
2550 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
2551 r = -EINVAL;
2552 goto out_err;
2553 }
2554 break;
2555 default:
2556 R600_ERR("unsupported processor type %d\n", ctx.type);
2557 r = -EINVAL;
2558 goto out_err;
2559 }
2560
2561 if (output[j].type==-1) {
2562 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2563 output[j].array_base = next_param_base++;
2564 }
2565 }
2566
2567 /* add fake position export */
2568 if (ctx.type == TGSI_PROCESSOR_VERTEX && pos_emitted == false) {
2569 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2570 output[j].gpr = 0;
2571 output[j].elem_size = 3;
2572 output[j].swizzle_x = 7;
2573 output[j].swizzle_y = 7;
2574 output[j].swizzle_z = 7;
2575 output[j].swizzle_w = 7;
2576 output[j].burst_count = 1;
2577 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2578 output[j].array_base = 60;
2579 output[j].op = CF_OP_EXPORT;
2580 j++;
2581 }
2582
2583 /* add fake param output for vertex shader if no param is exported */
2584 if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
2585 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2586 output[j].gpr = 0;
2587 output[j].elem_size = 3;
2588 output[j].swizzle_x = 7;
2589 output[j].swizzle_y = 7;
2590 output[j].swizzle_z = 7;
2591 output[j].swizzle_w = 7;
2592 output[j].burst_count = 1;
2593 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2594 output[j].array_base = 0;
2595 output[j].op = CF_OP_EXPORT;
2596 j++;
2597 }
2598
2599 /* add fake pixel export */
2600 if (ctx.type == TGSI_PROCESSOR_FRAGMENT && shader->nr_ps_color_exports == 0) {
2601 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2602 output[j].gpr = 0;
2603 output[j].elem_size = 3;
2604 output[j].swizzle_x = 7;
2605 output[j].swizzle_y = 7;
2606 output[j].swizzle_z = 7;
2607 output[j].swizzle_w = 7;
2608 output[j].burst_count = 1;
2609 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2610 output[j].array_base = 0;
2611 output[j].op = CF_OP_EXPORT;
2612 j++;
2613 shader->nr_ps_color_exports++;
2614 }
2615
2616 noutput = j;
2617
2618 /* set export done on last export of each type */
2619 for (i = noutput - 1, output_done = 0; i >= 0; i--) {
2620 if (!(output_done & (1 << output[i].type))) {
2621 output_done |= (1 << output[i].type);
2622 output[i].op = CF_OP_EXPORT_DONE;
2623 }
2624 }
2625 /* add output to bytecode */
2626 if (!use_llvm) {
2627 for (i = 0; i < noutput; i++) {
2628 r = r600_bytecode_add_output(ctx.bc, &output[i]);
2629 if (r)
2630 goto out_err;
2631 }
2632 }
2633 }
2634
2635 /* add program end */
2636 if (!use_llvm) {
2637 if (ctx.bc->chip_class == CAYMAN)
2638 cm_bytecode_add_cf_end(ctx.bc);
2639 else {
2640 const struct cf_op_info *last = NULL;
2641
2642 if (ctx.bc->cf_last)
2643 last = r600_isa_cf(ctx.bc->cf_last->op);
2644
2645 /* alu clause instructions don't have EOP bit, so add NOP */
2646 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS)
2647 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2648
2649 ctx.bc->cf_last->end_of_program = 1;
2650 }
2651 }
2652
2653 /* check GPR limit - we have 124 = 128 - 4
2654 * (4 are reserved as alu clause temporary registers) */
2655 if (ctx.bc->ngpr > 124) {
2656 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
2657 r = -ENOMEM;
2658 goto out_err;
2659 }
2660
2661 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2662 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
2663 return r;
2664 }
2665
2666 free(ctx.literals);
2667 tgsi_parse_free(&ctx.parse);
2668 return 0;
2669 out_err:
2670 free(ctx.literals);
2671 tgsi_parse_free(&ctx.parse);
2672 return r;
2673 }
2674
2675 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
2676 {
2677 const unsigned tgsi_opcode =
2678 ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
2679 R600_ERR("%s tgsi opcode unsupported\n",
2680 tgsi_get_opcode_name(tgsi_opcode));
2681 return -EINVAL;
2682 }
2683
/* Handler for the TGSI END opcode: nothing to emit here — program
 * termination is handled by the caller (end_of_program / CF_END). */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	return 0;
}
2688
2689 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
2690 const struct r600_shader_src *shader_src,
2691 unsigned chan)
2692 {
2693 bc_src->sel = shader_src->sel;
2694 bc_src->chan = shader_src->swizzle[chan];
2695 bc_src->neg = shader_src->neg;
2696 bc_src->abs = shader_src->abs;
2697 bc_src->rel = shader_src->rel;
2698 bc_src->value = shader_src->value[bc_src->chan];
2699 bc_src->kc_bank = shader_src->kc_bank;
2700 bc_src->kc_rel = shader_src->kc_rel;
2701 }
2702
2703 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
2704 {
2705 bc_src->abs = 1;
2706 bc_src->neg = 0;
2707 }
2708
2709 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
2710 {
2711 bc_src->neg = !bc_src->neg;
2712 }
2713
2714 static void tgsi_dst(struct r600_shader_ctx *ctx,
2715 const struct tgsi_full_dst_register *tgsi_dst,
2716 unsigned swizzle,
2717 struct r600_bytecode_alu_dst *r600_dst)
2718 {
2719 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2720
2721 r600_dst->sel = tgsi_dst->Register.Index;
2722 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
2723 r600_dst->chan = swizzle;
2724 r600_dst->write = 1;
2725 if (tgsi_dst->Register.Indirect)
2726 r600_dst->rel = V_SQ_REL_RELATIVE;
2727 if (inst->Instruction.Saturate) {
2728 r600_dst->clamp = 1;
2729 }
2730 }
2731
/* Return the index of the highest enabled channel (0-3) in 'writemask',
 * or 0 when the mask is empty. */
static int tgsi_last_instruction(unsigned writemask)
{
	int i;

	for (i = 3; i > 0; i--) {
		if (writemask & (1 << i))
			return i;
	}
	return 0;
}
2743
2744
2745
/* Common emitter for two-source 64-bit (double) ALU ops.
 *
 * Doubles occupy channel pairs: xy holds one double, zw another, with the
 * low dword in the even channel and the high dword (holding the sign bit)
 * in the odd channel.
 *
 * singledest: the op produces a single double result; a scalar writemask
 *             bit is widened to the full channel pair, staging through
 *             temp_reg when the requested component is the odd half.
 * swap:       emit src[1], src[0] instead of src[0], src[1].
 */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;

	if (singledest) {
		/* widen a scalar mask to its enclosing channel pair; when the
		 * requested component is the odd half (y or w) the result must
		 * be staged through a temp; use_tmp - 1 is then the temp
		 * channel the final MOV reads from (0 for y, 2 for w). */
		switch (write_mask) {
		case 0x1:
			write_mask = 0x3;
			break;
		case 0x2:
			use_tmp = 1;
			write_mask = 0x3;
			break;
		case 0x4:
			write_mask = 0xc;
			break;
		case 0x8:
			write_mask = 0xc;
			use_tmp = 3;
			break;
		}
	}

	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			if (use_tmp) {
				/* redirect to the staging temp */
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
			}
			/* only the even (low-dword) lane of each pair writes */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			/* DABS takes its single source without the fp64 swizzle */
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases */
		if (i == 1 || i == 3) {
			/* sign/abs modifiers only make sense on the high dword,
			 * where the IEEE-754 sign bit of the double lives */
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_SUB:
				r600_bytecode_src_toggle_neg(&alu.src[1]);
				break;
			case TGSI_OPCODE_DABS:
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		write_mask = inst->Dst[0].Register.WriteMask;

		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			/* use_tmp - 1 selects the staged low dword (0 or 2) */
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2848
2849 static int tgsi_op2_64(struct r600_shader_ctx *ctx)
2850 {
2851 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2852 unsigned write_mask = inst->Dst[0].Register.WriteMask;
2853 /* confirm writemasking */
2854 if ((write_mask & 0x3) != 0x3 &&
2855 (write_mask & 0xc) != 0xc) {
2856 fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
2857 return -1;
2858 }
2859 return tgsi_op2_64_params(ctx, false, false);
2860 }
2861
/* 64-bit op producing a single double result, sources in TGSI order. */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false);
}
2866
/* 64-bit op producing a single double result, with sources swapped. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true);
}
2871
2872 static int tgsi_op3_64(struct r600_shader_ctx *ctx)
2873 {
2874 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2875 struct r600_bytecode_alu alu;
2876 int i, j, r;
2877 int lasti = 3;
2878 int tmp = r600_get_temp(ctx);
2879
2880 for (i = 0; i < lasti + 1; i++) {
2881
2882 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2883 alu.op = ctx->inst_info->op;
2884 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
2885 r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
2886 }
2887
2888 if (inst->Dst[0].Register.WriteMask & (1 << i))
2889 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2890 else
2891 alu.dst.sel = tmp;
2892
2893 alu.dst.chan = i;
2894 alu.is_op3 = 1;
2895 if (i == lasti) {
2896 alu.last = 1;
2897 }
2898 r = r600_bytecode_add_alu(ctx->bc, &alu);
2899 if (r)
2900 return r;
2901 }
2902 return 0;
2903 }
2904
2905 static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
2906 {
2907 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2908 struct r600_bytecode_alu alu;
2909 unsigned write_mask = inst->Dst[0].Register.WriteMask;
2910 int i, j, r, lasti = tgsi_last_instruction(write_mask);
2911 /* use temp register if trans_only and more than one dst component */
2912 int use_tmp = trans_only && (write_mask ^ (1 << lasti));
2913
2914 for (i = 0; i <= lasti; i++) {
2915 if (!(write_mask & (1 << i)))
2916 continue;
2917
2918 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2919 if (use_tmp) {
2920 alu.dst.sel = ctx->temp_reg;
2921 alu.dst.chan = i;
2922 alu.dst.write = 1;
2923 } else
2924 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2925
2926 alu.op = ctx->inst_info->op;
2927 if (!swap) {
2928 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
2929 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
2930 }
2931 } else {
2932 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2933 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2934 }
2935 /* handle some special cases */
2936 switch (inst->Instruction.Opcode) {
2937 case TGSI_OPCODE_SUB:
2938 r600_bytecode_src_toggle_neg(&alu.src[1]);
2939 break;
2940 case TGSI_OPCODE_ABS:
2941 r600_bytecode_src_set_abs(&alu.src[0]);
2942 break;
2943 default:
2944 break;
2945 }
2946 if (i == lasti || trans_only) {
2947 alu.last = 1;
2948 }
2949 r = r600_bytecode_add_alu(ctx->bc, &alu);
2950 if (r)
2951 return r;
2952 }
2953
2954 if (use_tmp) {
2955 /* move result from temp to dst */
2956 for (i = 0; i <= lasti; i++) {
2957 if (!(write_mask & (1 << i)))
2958 continue;
2959
2960 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2961 alu.op = ALU_OP1_MOV;
2962 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2963 alu.src[0].sel = ctx->temp_reg;
2964 alu.src[0].chan = i;
2965 alu.last = (i == lasti);
2966
2967 r = r600_bytecode_add_alu(ctx->bc, &alu);
2968 if (r)
2969 return r;
2970 }
2971 }
2972 return 0;
2973 }
2974
/* Plain two-operand op: sources in TGSI order, any ALU unit. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
2979
/* Two-operand op with reversed source order. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
2984
/* Two-operand op restricted to the trans (scalar) ALU unit. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
2989
2990 static int tgsi_ineg(struct r600_shader_ctx *ctx)
2991 {
2992 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2993 struct r600_bytecode_alu alu;
2994 int i, r;
2995 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2996
2997 for (i = 0; i < lasti + 1; i++) {
2998
2999 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3000 continue;
3001 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3002 alu.op = ctx->inst_info->op;
3003
3004 alu.src[0].sel = V_SQ_ALU_SRC_0;
3005
3006 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3007
3008 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3009
3010 if (i == lasti) {
3011 alu.last = 1;
3012 }
3013 r = r600_bytecode_add_alu(ctx->bc, &alu);
3014 if (r)
3015 return r;
3016 }
3017 return 0;
3018
3019 }
3020
3021 static int tgsi_dneg(struct r600_shader_ctx *ctx)
3022 {
3023 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3024 struct r600_bytecode_alu alu;
3025 int i, r;
3026 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3027
3028 for (i = 0; i < lasti + 1; i++) {
3029
3030 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3031 continue;
3032 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3033 alu.op = ALU_OP1_MOV;
3034
3035 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3036
3037 if (i == 1 || i == 3)
3038 r600_bytecode_src_toggle_neg(&alu.src[0]);
3039 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3040
3041 if (i == lasti) {
3042 alu.last = 1;
3043 }
3044 r = r600_bytecode_add_alu(ctx->bc, &alu);
3045 if (r)
3046 return r;
3047 }
3048 return 0;
3049
3050 }
3051
/* DFRACEXP: decompose a double into a fractional part (written to dst0)
 * and an integer exponent (written to the first enabled channel of
 * dst1). */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;
	/* if only zw are written, the result pair goes to channels z/w */
	int firsti = write_mask == 0xc ? 2 : 0;

	/* run the hardware op over all four slots into temp_reg;
	 * fp64_switch() reorders the 32-bit halves of the double sources
	 * into the order the hardware expects */
	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* MOV first two channels to writemask dst0 — the fraction lands
	 * in temp channels z/w (src chan = i + 2) */
	for (i = 0; i <= 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].chan = i + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], firsti + i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> (firsti + i)) & 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* copy the exponent — presumably held in temp channel y — to the
	 * first written channel of dst1; only one channel is needed,
	 * hence the break.  NOTE(review): the original comment says
	 * "third channel" but chan 1 is read — verify against ISA docs. */
	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}
3112
3113
/* EG/Cayman I2D / U2D: convert 32-bit (un)signed integers to doubles.
 * Pass 1 converts each source int to a 32-bit float in temp_reg; pass 2
 * widens to doubles, feeding the converted float on even destination
 * channels and a zero literal on the odd (upper-half) channels. */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
		inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	/* pass 1: int -> float, one conversion per destination pair */
	for (i = 0; i <= (lasti+1)/2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* pass 2: float -> double widening per destination channel */
	for (i = 0; i <= lasti; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_FLT32_TO_FLT64;

		alu.src[0].chan = i/2;
		if (i%2 == 0)
			alu.src[0].sel = ctx->temp_reg;
		else {
			/* odd channel: zero literal for the second half */
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0x0;
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3160
3161 static int egcm_double_to_int(struct r600_shader_ctx *ctx)
3162 {
3163 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3164 struct r600_bytecode_alu alu;
3165 int i, r;
3166 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3167
3168 assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
3169 inst->Instruction.Opcode == TGSI_OPCODE_D2U);
3170
3171 for (i = 0; i <= lasti; i++) {
3172 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3173 alu.op = ALU_OP1_FLT64_TO_FLT32;
3174
3175 r600_bytecode_src(&alu.src[0], &ctx->src[0], fp64_switch(i));
3176 alu.dst.chan = i;
3177 alu.dst.sel = ctx->temp_reg;
3178 alu.dst.write = i%2 == 0;
3179 alu.last = i == lasti;
3180
3181 r = r600_bytecode_add_alu(ctx->bc, &alu);
3182 if (r)
3183 return r;
3184 }
3185
3186 for (i = 0; i <= (lasti+1)/2; i++) {
3187 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3188 alu.op = ctx->inst_info->op;
3189
3190 alu.src[0].chan = i*2;
3191 alu.src[0].sel = ctx->temp_reg;
3192 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
3193 alu.last = 1;
3194
3195 r = r600_bytecode_add_alu(ctx->bc, &alu);
3196 if (r)
3197 return r;
3198 }
3199
3200 return 0;
3201 }
3202
/* Cayman double-precision scalar op (DRSQ/DSQRT/DRCP-style): issue the
 * op across the vector slots with the source double's two halves fed as
 * src0/src1, keep only the x/y result pair in temp, then fan that pair
 * out to the written destination channels. */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = 3;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* these have to write the result to X/Y by the looks of it */
	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		/* should only be one src regs */
		assert (inst->Instruction.NumSrcRegs == 1);

		/* the double's halves: chan 1 into src0, chan 0 into src1 */
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);

		/* RSQ should take the absolute value of src */
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
			ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT) {
			r600_bytecode_src_set_abs(&alu.src[1]);
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		/* only the x/y pair of the result is committed */
		alu.dst.write = (i == 0 || i == 1);

		/* non-Cayman closes each instruction group immediately */
		if (ctx->bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* fan the x/y pair out: x/z channels read the low word (chan 0),
	 * y/w channels the high word (chan 1) */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3256
3257 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
3258 {
3259 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3260 int i, j, r;
3261 struct r600_bytecode_alu alu;
3262 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
3263
3264 for (i = 0 ; i < last_slot; i++) {
3265 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3266 alu.op = ctx->inst_info->op;
3267 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3268 r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
3269
3270 /* RSQ should take the absolute value of src */
3271 if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
3272 r600_bytecode_src_set_abs(&alu.src[j]);
3273 }
3274 }
3275 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3276 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3277
3278 if (i == last_slot - 1)
3279 alu.last = 1;
3280 r = r600_bytecode_add_alu(ctx->bc, &alu);
3281 if (r)
3282 return r;
3283 }
3284 return 0;
3285 }
3286
/* Cayman integer multiply (MULLO/MULHI-style ops): for each written
 * source channel k, issue the op over all four slots but commit only
 * slot k's result into temp_reg (dst.write = (i == k)); then move the
 * collected results to the real destination. */
static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, j, k, r;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* one full 4-slot issue per written channel k */
	for (k = 0; k <= lasti; k++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
			continue;

		for (i = 0 ; i < 4; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ctx->inst_info->op;
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
			}
			alu.dst.sel = t1;
			alu.dst.chan = i;
			/* only the slot matching the channel keeps its result */
			alu.dst.write = (i == k);
			if (i == 3)
				alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* copy the per-channel results from temp to the destination */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3334
3335
3336 static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
3337 {
3338 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3339 int i, j, k, r;
3340 struct r600_bytecode_alu alu;
3341 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3342 int t1 = ctx->temp_reg;
3343
3344 for (k = 0; k < 2; k++) {
3345 if (!(inst->Dst[0].Register.WriteMask & (0x3 << (k * 2))))
3346 continue;
3347
3348 for (i = 0; i < 4; i++) {
3349 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3350 alu.op = ctx->inst_info->op;
3351 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3352 r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));;
3353 }
3354 alu.dst.sel = t1;
3355 alu.dst.chan = i;
3356 alu.dst.write = 1;
3357 if (i == 3)
3358 alu.last = 1;
3359 r = r600_bytecode_add_alu(ctx->bc, &alu);
3360 if (r)
3361 return r;
3362 }
3363 }
3364
3365 for (i = 0; i <= lasti; i++) {
3366 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3367 continue;
3368 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3369 alu.op = ALU_OP1_MOV;
3370 alu.src[0].sel = t1;
3371 alu.src[0].chan = i;
3372 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3373 alu.dst.write = 1;
3374 if (i == lasti)
3375 alu.last = 1;
3376 r = r600_bytecode_add_alu(ctx->bc, &alu);
3377 if (r)
3378 return r;
3379 }
3380
3381 return 0;
3382 }
3383
3384 /*
3385 * r600 - trunc to -PI..PI range
3386 * r700 - normalize by dividing by 2PI
3387 * see fdo bug 27901
3388 */
3389 static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
3390 {
3391 static float half_inv_pi = 1.0 /(3.1415926535 * 2);
3392 static float double_pi = 3.1415926535 * 2;
3393 static float neg_pi = -3.1415926535;
3394
3395 int r;
3396 struct r600_bytecode_alu alu;
3397
3398 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3399 alu.op = ALU_OP3_MULADD;
3400 alu.is_op3 = 1;
3401
3402 alu.dst.chan = 0;
3403 alu.dst.sel = ctx->temp_reg;
3404 alu.dst.write = 1;
3405
3406 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
3407
3408 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3409 alu.src[1].chan = 0;
3410 alu.src[1].value = *(uint32_t *)&half_inv_pi;
3411 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
3412 alu.src[2].chan = 0;
3413 alu.last = 1;
3414 r = r600_bytecode_add_alu(ctx->bc, &alu);
3415 if (r)
3416 return r;
3417
3418 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3419 alu.op = ALU_OP1_FRACT;
3420
3421 alu.dst.chan = 0;
3422 alu.dst.sel = ctx->temp_reg;
3423 alu.dst.write = 1;
3424
3425 alu.src[0].sel = ctx->temp_reg;
3426 alu.src[0].chan = 0;
3427 alu.last = 1;
3428 r = r600_bytecode_add_alu(ctx->bc, &alu);
3429 if (r)
3430 return r;
3431
3432 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3433 alu.op = ALU_OP3_MULADD;
3434 alu.is_op3 = 1;
3435
3436 alu.dst.chan = 0;
3437 alu.dst.sel = ctx->temp_reg;
3438 alu.dst.write = 1;
3439
3440 alu.src[0].sel = ctx->temp_reg;
3441 alu.src[0].chan = 0;
3442
3443 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3444 alu.src[1].chan = 0;
3445 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
3446 alu.src[2].chan = 0;
3447
3448 if (ctx->bc->chip_class == R600) {
3449 alu.src[1].value = *(uint32_t *)&double_pi;
3450 alu.src[2].value = *(uint32_t *)&neg_pi;
3451 } else {
3452 alu.src[1].sel = V_SQ_ALU_SRC_1;
3453 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
3454 alu.src[2].neg = 1;
3455 }
3456
3457 alu.last = 1;
3458 r = r600_bytecode_add_alu(ctx->bc, &alu);
3459 if (r)
3460 return r;
3461 return 0;
3462 }
3463
3464 static int cayman_trig(struct r600_shader_ctx *ctx)
3465 {
3466 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3467 struct r600_bytecode_alu alu;
3468 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
3469 int i, r;
3470
3471 r = tgsi_setup_trig(ctx);
3472 if (r)
3473 return r;
3474
3475
3476 for (i = 0; i < last_slot; i++) {
3477 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3478 alu.op = ctx->inst_info->op;
3479 alu.dst.chan = i;
3480
3481 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3482 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3483
3484 alu.src[0].sel = ctx->temp_reg;
3485 alu.src[0].chan = 0;
3486 if (i == last_slot - 1)
3487 alu.last = 1;
3488 r = r600_bytecode_add_alu(ctx->bc, &alu);
3489 if (r)
3490 return r;
3491 }
3492 return 0;
3493 }
3494
3495 static int tgsi_trig(struct r600_shader_ctx *ctx)
3496 {
3497 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3498 struct r600_bytecode_alu alu;
3499 int i, r;
3500 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3501
3502 r = tgsi_setup_trig(ctx);
3503 if (r)
3504 return r;
3505
3506 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3507 alu.op = ctx->inst_info->op;
3508 alu.dst.chan = 0;
3509 alu.dst.sel = ctx->temp_reg;
3510 alu.dst.write = 1;
3511
3512 alu.src[0].sel = ctx->temp_reg;
3513 alu.src[0].chan = 0;
3514 alu.last = 1;
3515 r = r600_bytecode_add_alu(ctx->bc, &alu);
3516 if (r)
3517 return r;
3518
3519 /* replicate result */
3520 for (i = 0; i < lasti + 1; i++) {
3521 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3522 continue;
3523
3524 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3525 alu.op = ALU_OP1_MOV;
3526
3527 alu.src[0].sel = ctx->temp_reg;
3528 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3529 if (i == lasti)
3530 alu.last = 1;
3531 r = r600_bytecode_add_alu(ctx->bc, &alu);
3532 if (r)
3533 return r;
3534 }
3535 return 0;
3536 }
3537
/* SCS: dst = (cos(src.x), sin(src.x), 0.0, 1.0), each component gated
 * by the destination write mask.  On Cayman the scalar COS/SIN is
 * issued over three slots, committing only the slot that matches the
 * destination channel. */
static int tgsi_scs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	/* We'll only need the trig stuff if we are going to write to the
	 * X or Y components of the destination vector.
	 */
	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
		r = tgsi_setup_trig(ctx);
		if (r)
			return r;
	}

	/* dst.x = COS */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_COS;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				/* only slot 0 commits to dst.x */
				if (i == 0)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_COS;
			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.y = SIN */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_SIN;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				/* only slot 1 commits to dst.y */
				if (i == 1)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_SIN;
			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.z = 0.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.w = 1.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3658
3659 static int tgsi_kill(struct r600_shader_ctx *ctx)
3660 {
3661 const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3662 struct r600_bytecode_alu alu;
3663 int i, r;
3664
3665 for (i = 0; i < 4; i++) {
3666 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3667 alu.op = ctx->inst_info->op;
3668
3669 alu.dst.chan = i;
3670
3671 alu.src[0].sel = V_SQ_ALU_SRC_0;
3672
3673 if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
3674 alu.src[1].sel = V_SQ_ALU_SRC_1;
3675 alu.src[1].neg = 1;
3676 } else {
3677 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3678 }
3679 if (i == 3) {
3680 alu.last = 1;
3681 }
3682 r = r600_bytecode_add_alu(ctx->bc, &alu);
3683 if (r)
3684 return r;
3685 }
3686
3687 /* kill must be last in ALU */
3688 ctx->bc->force_add_cf = 1;
3689 ctx->shader->uses_kill = TRUE;
3690 return 0;
3691 }
3692
/* LIT: lighting coefficients.
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0.0)
 *   dst.z = specular term, built from LOG_CLAMPED / MUL_LIT / EXP_IEEE
 *           (only emitted when dst.z is in the write mask)
 *   dst.w = 1.0
 * On Cayman the t-slot LOG/EXP ops are replicated over three vector
 * slots, committing only the slot holding the needed result. */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		int i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the log result landed — the channel
		 * differs between the two branches above */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
3842
3843 static int tgsi_rsq(struct r600_shader_ctx *ctx)
3844 {
3845 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3846 struct r600_bytecode_alu alu;
3847 int i, r;
3848
3849 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3850
3851 /* XXX:
3852 * For state trackers other than OpenGL, we'll want to use
3853 * _RECIPSQRT_IEEE instead.
3854 */
3855 alu.op = ALU_OP1_RECIPSQRT_CLAMPED;
3856
3857 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
3858 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
3859 r600_bytecode_src_set_abs(&alu.src[i]);
3860 }
3861 alu.dst.sel = ctx->temp_reg;
3862 alu.dst.write = 1;
3863 alu.last = 1;
3864 r = r600_bytecode_add_alu(ctx->bc, &alu);
3865 if (r)
3866 return r;
3867 /* replicate result */
3868 return tgsi_helper_tempx_replicate(ctx);
3869 }
3870
3871 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
3872 {
3873 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3874 struct r600_bytecode_alu alu;
3875 int i, r;
3876
3877 for (i = 0; i < 4; i++) {
3878 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3879 alu.src[0].sel = ctx->temp_reg;
3880 alu.op = ALU_OP1_MOV;
3881 alu.dst.chan = i;
3882 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3883 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3884 if (i == 3)
3885 alu.last = 1;
3886 r = r600_bytecode_add_alu(ctx->bc, &alu);
3887 if (r)
3888 return r;
3889 }
3890 return 0;
3891 }
3892
3893 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
3894 {
3895 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3896 struct r600_bytecode_alu alu;
3897 int i, r;
3898
3899 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3900 alu.op = ctx->inst_info->op;
3901 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
3902 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
3903 }
3904 alu.dst.sel = ctx->temp_reg;
3905 alu.dst.write = 1;
3906 alu.last = 1;
3907 r = r600_bytecode_add_alu(ctx->bc, &alu);
3908 if (r)
3909 return r;
3910 /* replicate result */
3911 return tgsi_helper_tempx_replicate(ctx);
3912 }
3913
/* Cayman POW: POW(a,b) = EXP2(b * LOG2(a)).  The t-slot LOG and EXP ops
 * are issued over multiple vector slots on Cayman; the multiply happens
 * once in temp.x. */
static int cayman_pow(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;

	/* LOG2(a) replicated into temp.xyz */
	for (i = 0; i < 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_LOG_IEEE;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		if (i == 2)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* b * LOG2(a) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MUL;
	r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
	alu.src[1].sel = ctx->temp_reg;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* EXP2 into each written destination slot */
	for (i = 0; i < last_slot; i++) {
		/* POW(a,b) = EXP2(b * LOG2(a))*/
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_EXP_IEEE;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
		if (i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3963
3964 static int tgsi_pow(struct r600_shader_ctx *ctx)
3965 {
3966 struct r600_bytecode_alu alu;
3967 int r;
3968
3969 /* LOG2(a) */
3970 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3971 alu.op = ALU_OP1_LOG_IEEE;
3972 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
3973 alu.dst.sel = ctx->temp_reg;
3974 alu.dst.write = 1;
3975 alu.last = 1;
3976 r = r600_bytecode_add_alu(ctx->bc, &alu);
3977 if (r)
3978 return r;
3979 /* b * LOG2(a) */
3980 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3981 alu.op = ALU_OP2_MUL;
3982 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
3983 alu.src[1].sel = ctx->temp_reg;
3984 alu.dst.sel = ctx->temp_reg;
3985 alu.dst.write = 1;
3986 alu.last = 1;
3987 r = r600_bytecode_add_alu(ctx->bc, &alu);
3988 if (r)
3989 return r;
3990 /* POW(a,b) = EXP2(b * LOG2(a))*/
3991 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3992 alu.op = ALU_OP1_EXP_IEEE;
3993 alu.src[0].sel = ctx->temp_reg;
3994 alu.dst.sel = ctx->temp_reg;
3995 alu.dst.write = 1;
3996 alu.last = 1;
3997 r = r600_bytecode_add_alu(ctx->bc, &alu);
3998 if (r)
3999 return r;
4000 return tgsi_helper_tempx_replicate(ctx);
4001 }
4002
4003 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
4004 {
4005 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4006 struct r600_bytecode_alu alu;
4007 int i, r, j;
4008 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4009 int tmp0 = ctx->temp_reg;
4010 int tmp1 = r600_get_temp(ctx);
4011 int tmp2 = r600_get_temp(ctx);
4012 int tmp3 = r600_get_temp(ctx);
4013 /* Unsigned path:
4014 *
4015 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
4016 *
4017 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
4018 * 2. tmp0.z = lo (tmp0.x * src2)
4019 * 3. tmp0.w = -tmp0.z
4020 * 4. tmp0.y = hi (tmp0.x * src2)
4021 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
4022 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
4023 * 7. tmp1.x = tmp0.x - tmp0.w
4024 * 8. tmp1.y = tmp0.x + tmp0.w
4025 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
4026 * 10. tmp0.z = hi(tmp0.x * src1) = q
4027 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
4028 *
4029 * 12. tmp0.w = src1 - tmp0.y = r
4030 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
4031 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
4032 *
4033 * if DIV
4034 *
4035 * 15. tmp1.z = tmp0.z + 1 = q + 1
4036 * 16. tmp1.w = tmp0.z - 1 = q - 1
4037 *
4038 * else MOD
4039 *
4040 * 15. tmp1.z = tmp0.w - src2 = r - src2
4041 * 16. tmp1.w = tmp0.w + src2 = r + src2
4042 *
4043 * endif
4044 *
4045 * 17. tmp1.x = tmp1.x & tmp1.y
4046 *
4047 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
4048 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
4049 *
4050 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
4051 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
4052 *
4053 * Signed path:
4054 *
4055 * Same as unsigned, using abs values of the operands,
4056 * and fixing the sign of the result in the end.
4057 */
4058
4059 for (i = 0; i < 4; i++) {
4060 if (!(write_mask & (1<<i)))
4061 continue;
4062
4063 if (signed_op) {
4064
4065 /* tmp2.x = -src0 */
4066 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4067 alu.op = ALU_OP2_SUB_INT;
4068
4069 alu.dst.sel = tmp2;
4070 alu.dst.chan = 0;
4071 alu.dst.write = 1;
4072
4073 alu.src[0].sel = V_SQ_ALU_SRC_0;
4074
4075 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4076
4077 alu.last = 1;
4078 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4079 return r;
4080
4081 /* tmp2.y = -src1 */
4082 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4083 alu.op = ALU_OP2_SUB_INT;
4084
4085 alu.dst.sel = tmp2;
4086 alu.dst.chan = 1;
4087 alu.dst.write = 1;
4088
4089 alu.src[0].sel = V_SQ_ALU_SRC_0;
4090
4091 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4092
4093 alu.last = 1;
4094 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4095 return r;
4096
4097 /* tmp2.z sign bit is set if src0 and src2 signs are different */
4098 /* it will be a sign of the quotient */
4099 if (!mod) {
4100
4101 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4102 alu.op = ALU_OP2_XOR_INT;
4103
4104 alu.dst.sel = tmp2;
4105 alu.dst.chan = 2;
4106 alu.dst.write = 1;
4107
4108 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4109 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4110
4111 alu.last = 1;
4112 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4113 return r;
4114 }
4115
4116 /* tmp2.x = |src0| */
4117 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4118 alu.op = ALU_OP3_CNDGE_INT;
4119 alu.is_op3 = 1;
4120
4121 alu.dst.sel = tmp2;
4122 alu.dst.chan = 0;
4123 alu.dst.write = 1;
4124
4125 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4126 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4127 alu.src[2].sel = tmp2;
4128 alu.src[2].chan = 0;
4129
4130 alu.last = 1;
4131 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4132 return r;
4133
4134 /* tmp2.y = |src1| */
4135 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4136 alu.op = ALU_OP3_CNDGE_INT;
4137 alu.is_op3 = 1;
4138
4139 alu.dst.sel = tmp2;
4140 alu.dst.chan = 1;
4141 alu.dst.write = 1;
4142
4143 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4144 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4145 alu.src[2].sel = tmp2;
4146 alu.src[2].chan = 1;
4147
4148 alu.last = 1;
4149 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4150 return r;
4151
4152 }
4153
4154 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
4155 if (ctx->bc->chip_class == CAYMAN) {
4156 /* tmp3.x = u2f(src2) */
4157 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4158 alu.op = ALU_OP1_UINT_TO_FLT;
4159
4160 alu.dst.sel = tmp3;
4161 alu.dst.chan = 0;
4162 alu.dst.write = 1;
4163
4164 if (signed_op) {
4165 alu.src[0].sel = tmp2;
4166 alu.src[0].chan = 1;
4167 } else {
4168 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4169 }
4170
4171 alu.last = 1;
4172 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4173 return r;
4174
4175 /* tmp0.x = recip(tmp3.x) */
4176 for (j = 0 ; j < 3; j++) {
4177 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4178 alu.op = ALU_OP1_RECIP_IEEE;
4179
4180 alu.dst.sel = tmp0;
4181 alu.dst.chan = j;
4182 alu.dst.write = (j == 0);
4183
4184 alu.src[0].sel = tmp3;
4185 alu.src[0].chan = 0;
4186
4187 if (j == 2)
4188 alu.last = 1;
4189 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4190 return r;
4191 }
4192
4193 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4194 alu.op = ALU_OP2_MUL;
4195
4196 alu.src[0].sel = tmp0;
4197 alu.src[0].chan = 0;
4198
4199 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4200 alu.src[1].value = 0x4f800000;
4201
4202 alu.dst.sel = tmp3;
4203 alu.dst.write = 1;
4204 alu.last = 1;
4205 r = r600_bytecode_add_alu(ctx->bc, &alu);
4206 if (r)
4207 return r;
4208
4209 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4210 alu.op = ALU_OP1_FLT_TO_UINT;
4211
4212 alu.dst.sel = tmp0;
4213 alu.dst.chan = 0;
4214 alu.dst.write = 1;
4215
4216 alu.src[0].sel = tmp3;
4217 alu.src[0].chan = 0;
4218
4219 alu.last = 1;
4220 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4221 return r;
4222
4223 } else {
4224 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4225 alu.op = ALU_OP1_RECIP_UINT;
4226
4227 alu.dst.sel = tmp0;
4228 alu.dst.chan = 0;
4229 alu.dst.write = 1;
4230
4231 if (signed_op) {
4232 alu.src[0].sel = tmp2;
4233 alu.src[0].chan = 1;
4234 } else {
4235 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4236 }
4237
4238 alu.last = 1;
4239 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4240 return r;
4241 }
4242
4243 /* 2. tmp0.z = lo (tmp0.x * src2) */
4244 if (ctx->bc->chip_class == CAYMAN) {
4245 for (j = 0 ; j < 4; j++) {
4246 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4247 alu.op = ALU_OP2_MULLO_UINT;
4248
4249 alu.dst.sel = tmp0;
4250 alu.dst.chan = j;
4251 alu.dst.write = (j == 2);
4252
4253 alu.src[0].sel = tmp0;
4254 alu.src[0].chan = 0;
4255 if (signed_op) {
4256 alu.src[1].sel = tmp2;
4257 alu.src[1].chan = 1;
4258 } else {
4259 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4260 }
4261
4262 alu.last = (j == 3);
4263 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4264 return r;
4265 }
4266 } else {
4267 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4268 alu.op = ALU_OP2_MULLO_UINT;
4269
4270 alu.dst.sel = tmp0;
4271 alu.dst.chan = 2;
4272 alu.dst.write = 1;
4273
4274 alu.src[0].sel = tmp0;
4275 alu.src[0].chan = 0;
4276 if (signed_op) {
4277 alu.src[1].sel = tmp2;
4278 alu.src[1].chan = 1;
4279 } else {
4280 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4281 }
4282
4283 alu.last = 1;
4284 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4285 return r;
4286 }
4287
4288 /* 3. tmp0.w = -tmp0.z */
4289 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4290 alu.op = ALU_OP2_SUB_INT;
4291
4292 alu.dst.sel = tmp0;
4293 alu.dst.chan = 3;
4294 alu.dst.write = 1;
4295
4296 alu.src[0].sel = V_SQ_ALU_SRC_0;
4297 alu.src[1].sel = tmp0;
4298 alu.src[1].chan = 2;
4299
4300 alu.last = 1;
4301 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4302 return r;
4303
4304 /* 4. tmp0.y = hi (tmp0.x * src2) */
4305 if (ctx->bc->chip_class == CAYMAN) {
4306 for (j = 0 ; j < 4; j++) {
4307 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4308 alu.op = ALU_OP2_MULHI_UINT;
4309
4310 alu.dst.sel = tmp0;
4311 alu.dst.chan = j;
4312 alu.dst.write = (j == 1);
4313
4314 alu.src[0].sel = tmp0;
4315 alu.src[0].chan = 0;
4316
4317 if (signed_op) {
4318 alu.src[1].sel = tmp2;
4319 alu.src[1].chan = 1;
4320 } else {
4321 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4322 }
4323 alu.last = (j == 3);
4324 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4325 return r;
4326 }
4327 } else {
4328 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4329 alu.op = ALU_OP2_MULHI_UINT;
4330
4331 alu.dst.sel = tmp0;
4332 alu.dst.chan = 1;
4333 alu.dst.write = 1;
4334
4335 alu.src[0].sel = tmp0;
4336 alu.src[0].chan = 0;
4337
4338 if (signed_op) {
4339 alu.src[1].sel = tmp2;
4340 alu.src[1].chan = 1;
4341 } else {
4342 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4343 }
4344
4345 alu.last = 1;
4346 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4347 return r;
4348 }
4349
4350 /* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src)) */
4351 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4352 alu.op = ALU_OP3_CNDE_INT;
4353 alu.is_op3 = 1;
4354
4355 alu.dst.sel = tmp0;
4356 alu.dst.chan = 2;
4357 alu.dst.write = 1;
4358
4359 alu.src[0].sel = tmp0;
4360 alu.src[0].chan = 1;
4361 alu.src[1].sel = tmp0;
4362 alu.src[1].chan = 3;
4363 alu.src[2].sel = tmp0;
4364 alu.src[2].chan = 2;
4365
4366 alu.last = 1;
4367 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4368 return r;
4369
4370 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
4371 if (ctx->bc->chip_class == CAYMAN) {
4372 for (j = 0 ; j < 4; j++) {
4373 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4374 alu.op = ALU_OP2_MULHI_UINT;
4375
4376 alu.dst.sel = tmp0;
4377 alu.dst.chan = j;
4378 alu.dst.write = (j == 3);
4379
4380 alu.src[0].sel = tmp0;
4381 alu.src[0].chan = 2;
4382
4383 alu.src[1].sel = tmp0;
4384 alu.src[1].chan = 0;
4385
4386 alu.last = (j == 3);
4387 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4388 return r;
4389 }
4390 } else {
4391 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4392 alu.op = ALU_OP2_MULHI_UINT;
4393
4394 alu.dst.sel = tmp0;
4395 alu.dst.chan = 3;
4396 alu.dst.write = 1;
4397
4398 alu.src[0].sel = tmp0;
4399 alu.src[0].chan = 2;
4400
4401 alu.src[1].sel = tmp0;
4402 alu.src[1].chan = 0;
4403
4404 alu.last = 1;
4405 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4406 return r;
4407 }
4408
4409 /* 7. tmp1.x = tmp0.x - tmp0.w */
4410 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4411 alu.op = ALU_OP2_SUB_INT;
4412
4413 alu.dst.sel = tmp1;
4414 alu.dst.chan = 0;
4415 alu.dst.write = 1;
4416
4417 alu.src[0].sel = tmp0;
4418 alu.src[0].chan = 0;
4419 alu.src[1].sel = tmp0;
4420 alu.src[1].chan = 3;
4421
4422 alu.last = 1;
4423 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4424 return r;
4425
4426 /* 8. tmp1.y = tmp0.x + tmp0.w */
4427 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4428 alu.op = ALU_OP2_ADD_INT;
4429
4430 alu.dst.sel = tmp1;
4431 alu.dst.chan = 1;
4432 alu.dst.write = 1;
4433
4434 alu.src[0].sel = tmp0;
4435 alu.src[0].chan = 0;
4436 alu.src[1].sel = tmp0;
4437 alu.src[1].chan = 3;
4438
4439 alu.last = 1;
4440 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4441 return r;
4442
4443 /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
4444 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4445 alu.op = ALU_OP3_CNDE_INT;
4446 alu.is_op3 = 1;
4447
4448 alu.dst.sel = tmp0;
4449 alu.dst.chan = 0;
4450 alu.dst.write = 1;
4451
4452 alu.src[0].sel = tmp0;
4453 alu.src[0].chan = 1;
4454 alu.src[1].sel = tmp1;
4455 alu.src[1].chan = 1;
4456 alu.src[2].sel = tmp1;
4457 alu.src[2].chan = 0;
4458
4459 alu.last = 1;
4460 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4461 return r;
4462
4463 /* 10. tmp0.z = hi(tmp0.x * src1) = q */
4464 if (ctx->bc->chip_class == CAYMAN) {
4465 for (j = 0 ; j < 4; j++) {
4466 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4467 alu.op = ALU_OP2_MULHI_UINT;
4468
4469 alu.dst.sel = tmp0;
4470 alu.dst.chan = j;
4471 alu.dst.write = (j == 2);
4472
4473 alu.src[0].sel = tmp0;
4474 alu.src[0].chan = 0;
4475
4476 if (signed_op) {
4477 alu.src[1].sel = tmp2;
4478 alu.src[1].chan = 0;
4479 } else {
4480 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4481 }
4482
4483 alu.last = (j == 3);
4484 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4485 return r;
4486 }
4487 } else {
4488 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4489 alu.op = ALU_OP2_MULHI_UINT;
4490
4491 alu.dst.sel = tmp0;
4492 alu.dst.chan = 2;
4493 alu.dst.write = 1;
4494
4495 alu.src[0].sel = tmp0;
4496 alu.src[0].chan = 0;
4497
4498 if (signed_op) {
4499 alu.src[1].sel = tmp2;
4500 alu.src[1].chan = 0;
4501 } else {
4502 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4503 }
4504
4505 alu.last = 1;
4506 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4507 return r;
4508 }
4509
4510 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
4511 if (ctx->bc->chip_class == CAYMAN) {
4512 for (j = 0 ; j < 4; j++) {
4513 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4514 alu.op = ALU_OP2_MULLO_UINT;
4515
4516 alu.dst.sel = tmp0;
4517 alu.dst.chan = j;
4518 alu.dst.write = (j == 1);
4519
4520 if (signed_op) {
4521 alu.src[0].sel = tmp2;
4522 alu.src[0].chan = 1;
4523 } else {
4524 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4525 }
4526
4527 alu.src[1].sel = tmp0;
4528 alu.src[1].chan = 2;
4529
4530 alu.last = (j == 3);
4531 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4532 return r;
4533 }
4534 } else {
4535 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4536 alu.op = ALU_OP2_MULLO_UINT;
4537
4538 alu.dst.sel = tmp0;
4539 alu.dst.chan = 1;
4540 alu.dst.write = 1;
4541
4542 if (signed_op) {
4543 alu.src[0].sel = tmp2;
4544 alu.src[0].chan = 1;
4545 } else {
4546 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4547 }
4548
4549 alu.src[1].sel = tmp0;
4550 alu.src[1].chan = 2;
4551
4552 alu.last = 1;
4553 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4554 return r;
4555 }
4556
4557 /* 12. tmp0.w = src1 - tmp0.y = r */
4558 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4559 alu.op = ALU_OP2_SUB_INT;
4560
4561 alu.dst.sel = tmp0;
4562 alu.dst.chan = 3;
4563 alu.dst.write = 1;
4564
4565 if (signed_op) {
4566 alu.src[0].sel = tmp2;
4567 alu.src[0].chan = 0;
4568 } else {
4569 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4570 }
4571
4572 alu.src[1].sel = tmp0;
4573 alu.src[1].chan = 1;
4574
4575 alu.last = 1;
4576 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4577 return r;
4578
4579 /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
4580 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4581 alu.op = ALU_OP2_SETGE_UINT;
4582
4583 alu.dst.sel = tmp1;
4584 alu.dst.chan = 0;
4585 alu.dst.write = 1;
4586
4587 alu.src[0].sel = tmp0;
4588 alu.src[0].chan = 3;
4589 if (signed_op) {
4590 alu.src[1].sel = tmp2;
4591 alu.src[1].chan = 1;
4592 } else {
4593 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4594 }
4595
4596 alu.last = 1;
4597 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4598 return r;
4599
4600 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
4601 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4602 alu.op = ALU_OP2_SETGE_UINT;
4603
4604 alu.dst.sel = tmp1;
4605 alu.dst.chan = 1;
4606 alu.dst.write = 1;
4607
4608 if (signed_op) {
4609 alu.src[0].sel = tmp2;
4610 alu.src[0].chan = 0;
4611 } else {
4612 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4613 }
4614
4615 alu.src[1].sel = tmp0;
4616 alu.src[1].chan = 1;
4617
4618 alu.last = 1;
4619 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4620 return r;
4621
4622 if (mod) { /* UMOD */
4623
4624 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */
4625 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4626 alu.op = ALU_OP2_SUB_INT;
4627
4628 alu.dst.sel = tmp1;
4629 alu.dst.chan = 2;
4630 alu.dst.write = 1;
4631
4632 alu.src[0].sel = tmp0;
4633 alu.src[0].chan = 3;
4634
4635 if (signed_op) {
4636 alu.src[1].sel = tmp2;
4637 alu.src[1].chan = 1;
4638 } else {
4639 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4640 }
4641
4642 alu.last = 1;
4643 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4644 return r;
4645
4646 /* 16. tmp1.w = tmp0.w + src2 = r + src2 */
4647 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4648 alu.op = ALU_OP2_ADD_INT;
4649
4650 alu.dst.sel = tmp1;
4651 alu.dst.chan = 3;
4652 alu.dst.write = 1;
4653
4654 alu.src[0].sel = tmp0;
4655 alu.src[0].chan = 3;
4656 if (signed_op) {
4657 alu.src[1].sel = tmp2;
4658 alu.src[1].chan = 1;
4659 } else {
4660 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4661 }
4662
4663 alu.last = 1;
4664 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4665 return r;
4666
4667 } else { /* UDIV */
4668
4669 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
4670 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4671 alu.op = ALU_OP2_ADD_INT;
4672
4673 alu.dst.sel = tmp1;
4674 alu.dst.chan = 2;
4675 alu.dst.write = 1;
4676
4677 alu.src[0].sel = tmp0;
4678 alu.src[0].chan = 2;
4679 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
4680
4681 alu.last = 1;
4682 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4683 return r;
4684
4685 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */
4686 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4687 alu.op = ALU_OP2_ADD_INT;
4688
4689 alu.dst.sel = tmp1;
4690 alu.dst.chan = 3;
4691 alu.dst.write = 1;
4692
4693 alu.src[0].sel = tmp0;
4694 alu.src[0].chan = 2;
4695 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
4696
4697 alu.last = 1;
4698 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4699 return r;
4700
4701 }
4702
4703 /* 17. tmp1.x = tmp1.x & tmp1.y */
4704 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4705 alu.op = ALU_OP2_AND_INT;
4706
4707 alu.dst.sel = tmp1;
4708 alu.dst.chan = 0;
4709 alu.dst.write = 1;
4710
4711 alu.src[0].sel = tmp1;
4712 alu.src[0].chan = 0;
4713 alu.src[1].sel = tmp1;
4714 alu.src[1].chan = 1;
4715
4716 alu.last = 1;
4717 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4718 return r;
4719
4720 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
4721 /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
4722 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4723 alu.op = ALU_OP3_CNDE_INT;
4724 alu.is_op3 = 1;
4725
4726 alu.dst.sel = tmp0;
4727 alu.dst.chan = 2;
4728 alu.dst.write = 1;
4729
4730 alu.src[0].sel = tmp1;
4731 alu.src[0].chan = 0;
4732 alu.src[1].sel = tmp0;
4733 alu.src[1].chan = mod ? 3 : 2;
4734 alu.src[2].sel = tmp1;
4735 alu.src[2].chan = 2;
4736
4737 alu.last = 1;
4738 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4739 return r;
4740
4741 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
4742 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4743 alu.op = ALU_OP3_CNDE_INT;
4744 alu.is_op3 = 1;
4745
4746 if (signed_op) {
4747 alu.dst.sel = tmp0;
4748 alu.dst.chan = 2;
4749 alu.dst.write = 1;
4750 } else {
4751 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4752 }
4753
4754 alu.src[0].sel = tmp1;
4755 alu.src[0].chan = 1;
4756 alu.src[1].sel = tmp1;
4757 alu.src[1].chan = 3;
4758 alu.src[2].sel = tmp0;
4759 alu.src[2].chan = 2;
4760
4761 alu.last = 1;
4762 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4763 return r;
4764
4765 if (signed_op) {
4766
4767 /* fix the sign of the result */
4768
4769 if (mod) {
4770
4771 /* tmp0.x = -tmp0.z */
4772 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4773 alu.op = ALU_OP2_SUB_INT;
4774
4775 alu.dst.sel = tmp0;
4776 alu.dst.chan = 0;
4777 alu.dst.write = 1;
4778
4779 alu.src[0].sel = V_SQ_ALU_SRC_0;
4780 alu.src[1].sel = tmp0;
4781 alu.src[1].chan = 2;
4782
4783 alu.last = 1;
4784 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4785 return r;
4786
4787 /* sign of the remainder is the same as the sign of src0 */
4788 /* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
4789 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4790 alu.op = ALU_OP3_CNDGE_INT;
4791 alu.is_op3 = 1;
4792
4793 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4794
4795 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4796 alu.src[1].sel = tmp0;
4797 alu.src[1].chan = 2;
4798 alu.src[2].sel = tmp0;
4799 alu.src[2].chan = 0;
4800
4801 alu.last = 1;
4802 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4803 return r;
4804
4805 } else {
4806
4807 /* tmp0.x = -tmp0.z */
4808 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4809 alu.op = ALU_OP2_SUB_INT;
4810
4811 alu.dst.sel = tmp0;
4812 alu.dst.chan = 0;
4813 alu.dst.write = 1;
4814
4815 alu.src[0].sel = V_SQ_ALU_SRC_0;
4816 alu.src[1].sel = tmp0;
4817 alu.src[1].chan = 2;
4818
4819 alu.last = 1;
4820 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4821 return r;
4822
4823 /* fix the quotient sign (same as the sign of src0*src1) */
4824 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
4825 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4826 alu.op = ALU_OP3_CNDGE_INT;
4827 alu.is_op3 = 1;
4828
4829 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4830
4831 alu.src[0].sel = tmp2;
4832 alu.src[0].chan = 2;
4833 alu.src[1].sel = tmp0;
4834 alu.src[1].chan = 2;
4835 alu.src[2].sel = tmp0;
4836 alu.src[2].chan = 0;
4837
4838 alu.last = 1;
4839 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4840 return r;
4841 }
4842 }
4843 }
4844 return 0;
4845 }
4846
/* TGSI UDIV (unsigned integer divide): thin wrapper over the shared
 * divide/modulo emitter — second arg selects mod (0 = divide), third
 * selects signedness (0 = unsigned). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 0);
}
4851
/* TGSI UMOD (unsigned integer modulo): shared emitter with mod=1,
 * signed_op=0. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 0);
}
4856
/* TGSI IDIV (signed integer divide): shared emitter with mod=0,
 * signed_op=1 (sign handling done around the unsigned core). */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 1);
}
4861
/* TGSI IMOD (signed integer modulo): shared emitter with mod=1,
 * signed_op=1. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 1);
}
4866
4867
4868 static int tgsi_f2i(struct r600_shader_ctx *ctx)
4869 {
4870 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4871 struct r600_bytecode_alu alu;
4872 int i, r;
4873 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4874 int last_inst = tgsi_last_instruction(write_mask);
4875
4876 for (i = 0; i < 4; i++) {
4877 if (!(write_mask & (1<<i)))
4878 continue;
4879
4880 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4881 alu.op = ALU_OP1_TRUNC;
4882
4883 alu.dst.sel = ctx->temp_reg;
4884 alu.dst.chan = i;
4885 alu.dst.write = 1;
4886
4887 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4888 if (i == last_inst)
4889 alu.last = 1;
4890 r = r600_bytecode_add_alu(ctx->bc, &alu);
4891 if (r)
4892 return r;
4893 }
4894
4895 for (i = 0; i < 4; i++) {
4896 if (!(write_mask & (1<<i)))
4897 continue;
4898
4899 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4900 alu.op = ctx->inst_info->op;
4901
4902 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4903
4904 alu.src[0].sel = ctx->temp_reg;
4905 alu.src[0].chan = i;
4906
4907 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
4908 alu.last = 1;
4909 r = r600_bytecode_add_alu(ctx->bc, &alu);
4910 if (r)
4911 return r;
4912 }
4913
4914 return 0;
4915 }
4916
4917 static int tgsi_iabs(struct r600_shader_ctx *ctx)
4918 {
4919 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4920 struct r600_bytecode_alu alu;
4921 int i, r;
4922 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4923 int last_inst = tgsi_last_instruction(write_mask);
4924
4925 /* tmp = -src */
4926 for (i = 0; i < 4; i++) {
4927 if (!(write_mask & (1<<i)))
4928 continue;
4929
4930 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4931 alu.op = ALU_OP2_SUB_INT;
4932
4933 alu.dst.sel = ctx->temp_reg;
4934 alu.dst.chan = i;
4935 alu.dst.write = 1;
4936
4937 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4938 alu.src[0].sel = V_SQ_ALU_SRC_0;
4939
4940 if (i == last_inst)
4941 alu.last = 1;
4942 r = r600_bytecode_add_alu(ctx->bc, &alu);
4943 if (r)
4944 return r;
4945 }
4946
4947 /* dst = (src >= 0 ? src : tmp) */
4948 for (i = 0; i < 4; i++) {
4949 if (!(write_mask & (1<<i)))
4950 continue;
4951
4952 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4953 alu.op = ALU_OP3_CNDGE_INT;
4954 alu.is_op3 = 1;
4955 alu.dst.write = 1;
4956
4957 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4958
4959 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4960 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4961 alu.src[2].sel = ctx->temp_reg;
4962 alu.src[2].chan = i;
4963
4964 if (i == last_inst)
4965 alu.last = 1;
4966 r = r600_bytecode_add_alu(ctx->bc, &alu);
4967 if (r)
4968 return r;
4969 }
4970 return 0;
4971 }
4972
4973 static int tgsi_issg(struct r600_shader_ctx *ctx)
4974 {
4975 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4976 struct r600_bytecode_alu alu;
4977 int i, r;
4978 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4979 int last_inst = tgsi_last_instruction(write_mask);
4980
4981 /* tmp = (src >= 0 ? src : -1) */
4982 for (i = 0; i < 4; i++) {
4983 if (!(write_mask & (1<<i)))
4984 continue;
4985
4986 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4987 alu.op = ALU_OP3_CNDGE_INT;
4988 alu.is_op3 = 1;
4989
4990 alu.dst.sel = ctx->temp_reg;
4991 alu.dst.chan = i;
4992 alu.dst.write = 1;
4993
4994 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4995 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4996 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
4997
4998 if (i == last_inst)
4999 alu.last = 1;
5000 r = r600_bytecode_add_alu(ctx->bc, &alu);
5001 if (r)
5002 return r;
5003 }
5004
5005 /* dst = (tmp > 0 ? 1 : tmp) */
5006 for (i = 0; i < 4; i++) {
5007 if (!(write_mask & (1<<i)))
5008 continue;
5009
5010 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5011 alu.op = ALU_OP3_CNDGT_INT;
5012 alu.is_op3 = 1;
5013 alu.dst.write = 1;
5014
5015 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5016
5017 alu.src[0].sel = ctx->temp_reg;
5018 alu.src[0].chan = i;
5019
5020 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
5021
5022 alu.src[2].sel = ctx->temp_reg;
5023 alu.src[2].chan = i;
5024
5025 if (i == last_inst)
5026 alu.last = 1;
5027 r = r600_bytecode_add_alu(ctx->bc, &alu);
5028 if (r)
5029 return r;
5030 }
5031 return 0;
5032 }
5033
5034
5035
5036 static int tgsi_ssg(struct r600_shader_ctx *ctx)
5037 {
5038 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5039 struct r600_bytecode_alu alu;
5040 int i, r;
5041
5042 /* tmp = (src > 0 ? 1 : src) */
5043 for (i = 0; i < 4; i++) {
5044 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5045 alu.op = ALU_OP3_CNDGT;
5046 alu.is_op3 = 1;
5047
5048 alu.dst.sel = ctx->temp_reg;
5049 alu.dst.chan = i;
5050
5051 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5052 alu.src[1].sel = V_SQ_ALU_SRC_1;
5053 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
5054
5055 if (i == 3)
5056 alu.last = 1;
5057 r = r600_bytecode_add_alu(ctx->bc, &alu);
5058 if (r)
5059 return r;
5060 }
5061
5062 /* dst = (-tmp > 0 ? -1 : tmp) */
5063 for (i = 0; i < 4; i++) {
5064 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5065 alu.op = ALU_OP3_CNDGT;
5066 alu.is_op3 = 1;
5067 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5068
5069 alu.src[0].sel = ctx->temp_reg;
5070 alu.src[0].chan = i;
5071 alu.src[0].neg = 1;
5072
5073 alu.src[1].sel = V_SQ_ALU_SRC_1;
5074 alu.src[1].neg = 1;
5075
5076 alu.src[2].sel = ctx->temp_reg;
5077 alu.src[2].chan = i;
5078
5079 if (i == 3)
5080 alu.last = 1;
5081 r = r600_bytecode_add_alu(ctx->bc, &alu);
5082 if (r)
5083 return r;
5084 }
5085 return 0;
5086 }
5087
/*
 * Emit TGSI BFI (bitfield insert), per enabled write-mask channel.
 *
 * Three ALU passes:
 *   1. t1 = BFM(Src3, Src2)  -- build the insertion mask from
 *      width (Src3) and offset (Src2); BFM presumably computes
 *      ((1 << width) - 1) << offset -- confirm against the ISA doc.
 *   2. t2 = Src1 << Src2     -- align the value to insert.
 *   3. dst = BFI(t1, t2, Src0) -- merge t2 into the base (Src0)
 *      under mask t1.
 *
 * Returns 0 on success or the r600_bytecode_add_alu() error code.
 */
static int tgsi_bfi(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	t1 = ctx->temp_reg;

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* create mask tmp */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_BFM_INT;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* second scratch register for the shifted insert value */
	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* shift insert left */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_LSHL_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* actual bitfield insert */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_BFI_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		r600_bytecode_src(&alu.src[2], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5167
/*
 * Emit TGSI IMSB/UMSB (index of the most significant bit).
 *
 * The hardware FFBH opcodes count the bit position from the MSB while
 * TGSI expects an index from the LSB, so the result is remapped:
 *   t1  = FFBH_INT / FFBH_UINT (src)
 *   t2  = 31 - t1
 *   dst = (t1 >= 0) ? t2 : t1
 * A negative t1 (presumably the hardware's "no bit found" value) is
 * passed through unchanged.
 */
static int tgsi_msb(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, t1, t2;

	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int last_inst = tgsi_last_instruction(write_mask);

	/* only the two "find first bit (high)" opcodes route here */
	assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
		ctx->inst_info->op == ALU_OP1_FFBH_UINT);

	t1 = ctx->temp_reg;

	/* bit position is indexed from lsb by TGSI, and from msb by the hardware */
	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t1 = FFBH_INT / FFBH_UINT */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	t2 = r600_get_temp(ctx);

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* t2 = 31 - t1 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_SUB_INT;
		alu.dst.sel = t2;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[0].value = 31;
		alu.src[1].sel = t1;
		alu.src[1].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i < 4; i++) {
		if (!(write_mask & (1<<i)))
			continue;

		/* result = t1 >= 0 ? t2 : t1 */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE_INT;
		alu.is_op3 = 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = i == last_inst;

		alu.src[0].sel = t1;
		alu.src[0].chan = i;
		alu.src[1].sel = t2;
		alu.src[1].chan = i;
		alu.src[2].sel = t1;
		alu.src[2].chan = i;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5253
/*
 * Emit the evergreen/cayman interpolation opcodes:
 * TGSI INTERP_CENTROID, INTERP_OFFSET and INTERP_SAMPLE.
 *
 * OFFSET/SAMPLE start from the pixel-center barycentrics and shift them
 * explicitly by offset * screen-space gradient (gradients fetched with
 * GET_GRADIENTS_H/V through the texture unit); CENTROID reuses the
 * centroid interpolator reserved at shader setup.  The attribute is
 * then evaluated with INTERP_ZW/INTERP_XY pairs, and finally copied to
 * the destination with MOVs, because the INTERP ALU ops cannot swizzle
 * their destination.
 */
static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	int input;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	input = inst->Src[0].Register.Index;

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
	}

	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	/* two ij pairs are packed per GPR: /2 picks the register,
	 * %2 picks the channel pair within it */
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
		}

		/* fetch d(ij)/dx and d(ij)/dy of the barycentrics */
		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7; /* 7: presumably "channel masked" -- confirm vs ISA */
			tex.dst_sel_w = 7;
			tex.inst_mod = 1; // Use per pixel gradient calculation
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		/* temp.ij = offset.x * dH + ij  (offset from src1, or from the
		 * loaded sample position for INTERP_SAMPLE) */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.ij = offset.y * dV + temp.ij */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* interpolate the attribute: two 4-slot groups (ZW then XY);
	 * only slots 2..5 of the eight actually write a result */
	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2);
		} else {
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		alu.last = i % 4 == 3;
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	// INTERP can't swizzle dst
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5417
5418
5419 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
5420 {
5421 struct r600_bytecode_alu alu;
5422 int i, r;
5423
5424 for (i = 0; i < 4; i++) {
5425 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5426 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
5427 alu.op = ALU_OP0_NOP;
5428 alu.dst.chan = i;
5429 } else {
5430 alu.op = ALU_OP1_MOV;
5431 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5432 alu.src[0].sel = ctx->temp_reg;
5433 alu.src[0].chan = i;
5434 }
5435 if (i == 3) {
5436 alu.last = 1;
5437 }
5438 r = r600_bytecode_add_alu(ctx->bc, &alu);
5439 if (r)
5440 return r;
5441 }
5442 return 0;
5443 }
5444
5445 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
5446 unsigned temp, int chan,
5447 struct r600_bytecode_alu_src *bc_src,
5448 const struct r600_shader_src *shader_src)
5449 {
5450 struct r600_bytecode_alu alu;
5451 int r;
5452
5453 r600_bytecode_src(bc_src, shader_src, chan);
5454
5455 /* op3 operands don't support abs modifier */
5456 if (bc_src->abs) {
5457 assert(temp!=0); /* we actually need the extra register, make sure it is allocated. */
5458 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5459 alu.op = ALU_OP1_MOV;
5460 alu.dst.sel = temp;
5461 alu.dst.chan = chan;
5462 alu.dst.write = 1;
5463
5464 alu.src[0] = *bc_src;
5465 alu.last = true; // sufficient?
5466 r = r600_bytecode_add_alu(ctx->bc, &alu);
5467 if (r)
5468 return r;
5469
5470 memset(bc_src, 0, sizeof(*bc_src));
5471 bc_src->sel = temp;
5472 bc_src->chan = chan;
5473 }
5474 return 0;
5475 }
5476
5477 static int tgsi_op3(struct r600_shader_ctx *ctx)
5478 {
5479 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5480 struct r600_bytecode_alu alu;
5481 int i, j, r;
5482 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5483 int temp_regs[4];
5484
5485 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5486 temp_regs[j] = 0;
5487 if (ctx->src[j].abs)
5488 temp_regs[j] = r600_get_temp(ctx);
5489 }
5490 for (i = 0; i < lasti + 1; i++) {
5491 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5492 continue;
5493
5494 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5495 alu.op = ctx->inst_info->op;
5496 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5497 r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
5498 if (r)
5499 return r;
5500 }
5501
5502 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5503 alu.dst.chan = i;
5504 alu.dst.write = 1;
5505 alu.is_op3 = 1;
5506 if (i == lasti) {
5507 alu.last = 1;
5508 }
5509 r = r600_bytecode_add_alu(ctx->bc, &alu);
5510 if (r)
5511 return r;
5512 }
5513 return 0;
5514 }
5515
5516 static int tgsi_dp(struct r600_shader_ctx *ctx)
5517 {
5518 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5519 struct r600_bytecode_alu alu;
5520 int i, j, r;
5521
5522 for (i = 0; i < 4; i++) {
5523 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5524 alu.op = ctx->inst_info->op;
5525 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5526 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
5527 }
5528
5529 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5530 alu.dst.chan = i;
5531 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5532 /* handle some special cases */
5533 switch (inst->Instruction.Opcode) {
5534 case TGSI_OPCODE_DP2:
5535 if (i > 1) {
5536 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
5537 alu.src[0].chan = alu.src[1].chan = 0;
5538 }
5539 break;
5540 case TGSI_OPCODE_DP3:
5541 if (i > 2) {
5542 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
5543 alu.src[0].chan = alu.src[1].chan = 0;
5544 }
5545 break;
5546 case TGSI_OPCODE_DPH:
5547 if (i == 3) {
5548 alu.src[0].sel = V_SQ_ALU_SRC_1;
5549 alu.src[0].chan = 0;
5550 alu.src[0].neg = 0;
5551 }
5552 break;
5553 default:
5554 break;
5555 }
5556 if (i == 3) {
5557 alu.last = 1;
5558 }
5559 r = r600_bytecode_add_alu(ctx->bc, &alu);
5560 if (r)
5561 return r;
5562 }
5563 return 0;
5564 }
5565
5566 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
5567 unsigned index)
5568 {
5569 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5570 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
5571 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
5572 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
5573 ctx->src[index].neg || ctx->src[index].abs ||
5574 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == TGSI_PROCESSOR_GEOMETRY);
5575 }
5576
5577 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
5578 unsigned index)
5579 {
5580 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5581 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
5582 }
5583
5584 static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
5585 {
5586 struct r600_bytecode_vtx vtx;
5587 struct r600_bytecode_alu alu;
5588 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5589 int src_gpr, r, i;
5590 int id = tgsi_tex_get_src_gpr(ctx, 1);
5591
5592 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
5593 if (src_requires_loading) {
5594 for (i = 0; i < 4; i++) {
5595 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5596 alu.op = ALU_OP1_MOV;
5597 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5598 alu.dst.sel = ctx->temp_reg;
5599 alu.dst.chan = i;
5600 if (i == 3)
5601 alu.last = 1;
5602 alu.dst.write = 1;
5603 r = r600_bytecode_add_alu(ctx->bc, &alu);
5604 if (r)
5605 return r;
5606 }
5607 src_gpr = ctx->temp_reg;
5608 }
5609
5610 memset(&vtx, 0, sizeof(vtx));
5611 vtx.op = FETCH_OP_VFETCH;
5612 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
5613 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
5614 vtx.src_gpr = src_gpr;
5615 vtx.mega_fetch_count = 16;
5616 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
5617 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
5618 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
5619 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
5620 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
5621 vtx.use_const_fields = 1;
5622
5623 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
5624 return r;
5625
5626 if (ctx->bc->chip_class >= EVERGREEN)
5627 return 0;
5628
5629 for (i = 0; i < 4; i++) {
5630 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5631 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5632 continue;
5633
5634 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5635 alu.op = ALU_OP2_AND_INT;
5636
5637 alu.dst.chan = i;
5638 alu.dst.sel = vtx.dst_gpr;
5639 alu.dst.write = 1;
5640
5641 alu.src[0].sel = vtx.dst_gpr;
5642 alu.src[0].chan = i;
5643
5644 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
5645 alu.src[1].sel += (id * 2);
5646 alu.src[1].chan = i % 4;
5647 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5648
5649 if (i == lasti)
5650 alu.last = 1;
5651 r = r600_bytecode_add_alu(ctx->bc, &alu);
5652 if (r)
5653 return r;
5654 }
5655
5656 if (inst->Dst[0].Register.WriteMask & 3) {
5657 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5658 alu.op = ALU_OP2_OR_INT;
5659
5660 alu.dst.chan = 3;
5661 alu.dst.sel = vtx.dst_gpr;
5662 alu.dst.write = 1;
5663
5664 alu.src[0].sel = vtx.dst_gpr;
5665 alu.src[0].chan = 3;
5666
5667 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
5668 alu.src[1].chan = 0;
5669 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5670
5671 alu.last = 1;
5672 r = r600_bytecode_add_alu(ctx->bc, &alu);
5673 if (r)
5674 return r;
5675 }
5676 return 0;
5677 }
5678
5679 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
5680 {
5681 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5682 struct r600_bytecode_alu alu;
5683 int r;
5684 int id = tgsi_tex_get_src_gpr(ctx, 1);
5685
5686 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5687 alu.op = ALU_OP1_MOV;
5688 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
5689 if (ctx->bc->chip_class >= EVERGREEN) {
5690 /* channel 0 or 2 of each word */
5691 alu.src[0].sel += (id / 2);
5692 alu.src[0].chan = (id % 2) * 2;
5693 } else {
5694 /* r600 we have them at channel 2 of the second dword */
5695 alu.src[0].sel += (id * 2) + 1;
5696 alu.src[0].chan = 1;
5697 }
5698 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5699 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
5700 alu.last = 1;
5701 r = r600_bytecode_add_alu(ctx->bc, &alu);
5702 if (r)
5703 return r;
5704 return 0;
5705 }
5706
5707 static int tgsi_tex(struct r600_shader_ctx *ctx)
5708 {
5709 static float one_point_five = 1.5f;
5710 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5711 struct r600_bytecode_tex tex;
5712 struct r600_bytecode_alu alu;
5713 unsigned src_gpr;
5714 int r, i, j;
5715 int opcode;
5716 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
5717 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5718 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
5719 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
5720
5721 bool txf_add_offsets = inst->Texture.NumOffsets &&
5722 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5723 inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
5724
5725 /* Texture fetch instructions can only use gprs as source.
5726 * Also they cannot negate the source or take the absolute value */
5727 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
5728 inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
5729 tgsi_tex_src_requires_loading(ctx, 0)) ||
5730 read_compressed_msaa || txf_add_offsets;
5731
5732 boolean src_loaded = FALSE;
5733 unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
5734 int8_t offset_x = 0, offset_y = 0, offset_z = 0;
5735 boolean has_txq_cube_array_z = false;
5736 unsigned sampler_index_mode;
5737
5738 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
5739 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5740 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
5741 if (inst->Dst[0].Register.WriteMask & 4) {
5742 ctx->shader->has_txq_cube_array_z_comp = true;
5743 has_txq_cube_array_z = true;
5744 }
5745
5746 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
5747 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5748 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
5749 inst->Instruction.Opcode == TGSI_OPCODE_TG4)
5750 sampler_src_reg = 2;
5751
5752 /* TGSI moves the sampler to src reg 3 for TXD */
5753 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
5754 sampler_src_reg = 3;
5755
5756 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
5757
5758 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
5759
5760 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
5761 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
5762 ctx->shader->uses_tex_buffers = true;
5763 return r600_do_buffer_txq(ctx);
5764 }
5765 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
5766 if (ctx->bc->chip_class < EVERGREEN)
5767 ctx->shader->uses_tex_buffers = true;
5768 return do_vtx_fetch_inst(ctx, src_requires_loading);
5769 }
5770 }
5771
5772 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
5773 int out_chan;
5774 /* Add perspective divide */
5775 if (ctx->bc->chip_class == CAYMAN) {
5776 out_chan = 2;
5777 for (i = 0; i < 3; i++) {
5778 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5779 alu.op = ALU_OP1_RECIP_IEEE;
5780 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5781
5782 alu.dst.sel = ctx->temp_reg;
5783 alu.dst.chan = i;
5784 if (i == 2)
5785 alu.last = 1;
5786 if (out_chan == i)
5787 alu.dst.write = 1;
5788 r = r600_bytecode_add_alu(ctx->bc, &alu);
5789 if (r)
5790 return r;
5791 }
5792
5793 } else {
5794 out_chan = 3;
5795 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5796 alu.op = ALU_OP1_RECIP_IEEE;
5797 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5798
5799 alu.dst.sel = ctx->temp_reg;
5800 alu.dst.chan = out_chan;
5801 alu.last = 1;
5802 alu.dst.write = 1;
5803 r = r600_bytecode_add_alu(ctx->bc, &alu);
5804 if (r)
5805 return r;
5806 }
5807
5808 for (i = 0; i < 3; i++) {
5809 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5810 alu.op = ALU_OP2_MUL;
5811 alu.src[0].sel = ctx->temp_reg;
5812 alu.src[0].chan = out_chan;
5813 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5814 alu.dst.sel = ctx->temp_reg;
5815 alu.dst.chan = i;
5816 alu.dst.write = 1;
5817 r = r600_bytecode_add_alu(ctx->bc, &alu);
5818 if (r)
5819 return r;
5820 }
5821 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5822 alu.op = ALU_OP1_MOV;
5823 alu.src[0].sel = V_SQ_ALU_SRC_1;
5824 alu.src[0].chan = 0;
5825 alu.dst.sel = ctx->temp_reg;
5826 alu.dst.chan = 3;
5827 alu.last = 1;
5828 alu.dst.write = 1;
5829 r = r600_bytecode_add_alu(ctx->bc, &alu);
5830 if (r)
5831 return r;
5832 src_loaded = TRUE;
5833 src_gpr = ctx->temp_reg;
5834 }
5835
5836
5837 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
5838 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5839 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5840 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
5841 inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
5842 inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
5843
5844 static const unsigned src0_swizzle[] = {2, 2, 0, 1};
5845 static const unsigned src1_swizzle[] = {1, 0, 2, 2};
5846
5847 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
5848 for (i = 0; i < 4; i++) {
5849 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5850 alu.op = ALU_OP2_CUBE;
5851 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
5852 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
5853 alu.dst.sel = ctx->temp_reg;
5854 alu.dst.chan = i;
5855 if (i == 3)
5856 alu.last = 1;
5857 alu.dst.write = 1;
5858 r = r600_bytecode_add_alu(ctx->bc, &alu);
5859 if (r)
5860 return r;
5861 }
5862
5863 /* tmp1.z = RCP_e(|tmp1.z|) */
5864 if (ctx->bc->chip_class == CAYMAN) {
5865 for (i = 0; i < 3; i++) {
5866 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5867 alu.op = ALU_OP1_RECIP_IEEE;
5868 alu.src[0].sel = ctx->temp_reg;
5869 alu.src[0].chan = 2;
5870 alu.src[0].abs = 1;
5871 alu.dst.sel = ctx->temp_reg;
5872 alu.dst.chan = i;
5873 if (i == 2)
5874 alu.dst.write = 1;
5875 if (i == 2)
5876 alu.last = 1;
5877 r = r600_bytecode_add_alu(ctx->bc, &alu);
5878 if (r)
5879 return r;
5880 }
5881 } else {
5882 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5883 alu.op = ALU_OP1_RECIP_IEEE;
5884 alu.src[0].sel = ctx->temp_reg;
5885 alu.src[0].chan = 2;
5886 alu.src[0].abs = 1;
5887 alu.dst.sel = ctx->temp_reg;
5888 alu.dst.chan = 2;
5889 alu.dst.write = 1;
5890 alu.last = 1;
5891 r = r600_bytecode_add_alu(ctx->bc, &alu);
5892 if (r)
5893 return r;
5894 }
5895
5896 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
5897 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
5898 * muladd has no writemask, have to use another temp
5899 */
5900 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5901 alu.op = ALU_OP3_MULADD;
5902 alu.is_op3 = 1;
5903
5904 alu.src[0].sel = ctx->temp_reg;
5905 alu.src[0].chan = 0;
5906 alu.src[1].sel = ctx->temp_reg;
5907 alu.src[1].chan = 2;
5908
5909 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5910 alu.src[2].chan = 0;
5911 alu.src[2].value = *(uint32_t *)&one_point_five;
5912
5913 alu.dst.sel = ctx->temp_reg;
5914 alu.dst.chan = 0;
5915 alu.dst.write = 1;
5916
5917 r = r600_bytecode_add_alu(ctx->bc, &alu);
5918 if (r)
5919 return r;
5920
5921 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5922 alu.op = ALU_OP3_MULADD;
5923 alu.is_op3 = 1;
5924
5925 alu.src[0].sel = ctx->temp_reg;
5926 alu.src[0].chan = 1;
5927 alu.src[1].sel = ctx->temp_reg;
5928 alu.src[1].chan = 2;
5929
5930 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5931 alu.src[2].chan = 0;
5932 alu.src[2].value = *(uint32_t *)&one_point_five;
5933
5934 alu.dst.sel = ctx->temp_reg;
5935 alu.dst.chan = 1;
5936 alu.dst.write = 1;
5937
5938 alu.last = 1;
5939 r = r600_bytecode_add_alu(ctx->bc, &alu);
5940 if (r)
5941 return r;
5942 /* write initial compare value into Z component
5943 - W src 0 for shadow cube
5944 - X src 1 for shadow cube array */
5945 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5946 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5947 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5948 alu.op = ALU_OP1_MOV;
5949 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
5950 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5951 else
5952 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5953 alu.dst.sel = ctx->temp_reg;
5954 alu.dst.chan = 2;
5955 alu.dst.write = 1;
5956 alu.last = 1;
5957 r = r600_bytecode_add_alu(ctx->bc, &alu);
5958 if (r)
5959 return r;
5960 }
5961
5962 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5963 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5964 if (ctx->bc->chip_class >= EVERGREEN) {
5965 int mytmp = r600_get_temp(ctx);
5966 static const float eight = 8.0f;
5967 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5968 alu.op = ALU_OP1_MOV;
5969 alu.src[0].sel = ctx->temp_reg;
5970 alu.src[0].chan = 3;
5971 alu.dst.sel = mytmp;
5972 alu.dst.chan = 0;
5973 alu.dst.write = 1;
5974 alu.last = 1;
5975 r = r600_bytecode_add_alu(ctx->bc, &alu);
5976 if (r)
5977 return r;
5978
5979 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */
5980 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5981 alu.op = ALU_OP3_MULADD;
5982 alu.is_op3 = 1;
5983 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5984 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5985 alu.src[1].chan = 0;
5986 alu.src[1].value = *(uint32_t *)&eight;
5987 alu.src[2].sel = mytmp;
5988 alu.src[2].chan = 0;
5989 alu.dst.sel = ctx->temp_reg;
5990 alu.dst.chan = 3;
5991 alu.dst.write = 1;
5992 alu.last = 1;
5993 r = r600_bytecode_add_alu(ctx->bc, &alu);
5994 if (r)
5995 return r;
5996 } else if (ctx->bc->chip_class < EVERGREEN) {
5997 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5998 tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
5999 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6000 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6001 tex.src_gpr = r600_get_temp(ctx);
6002 tex.src_sel_x = 0;
6003 tex.src_sel_y = 0;
6004 tex.src_sel_z = 0;
6005 tex.src_sel_w = 0;
6006 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
6007 tex.coord_type_x = 1;
6008 tex.coord_type_y = 1;
6009 tex.coord_type_z = 1;
6010 tex.coord_type_w = 1;
6011 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6012 alu.op = ALU_OP1_MOV;
6013 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6014 alu.dst.sel = tex.src_gpr;
6015 alu.dst.chan = 0;
6016 alu.last = 1;
6017 alu.dst.write = 1;
6018 r = r600_bytecode_add_alu(ctx->bc, &alu);
6019 if (r)
6020 return r;
6021
6022 r = r600_bytecode_add_tex(ctx->bc, &tex);
6023 if (r)
6024 return r;
6025 }
6026
6027 }
6028
6029 /* for cube forms of lod and bias we need to route things */
6030 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
6031 inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
6032 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6033 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
6034 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6035 alu.op = ALU_OP1_MOV;
6036 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6037 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
6038 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
6039 else
6040 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6041 alu.dst.sel = ctx->temp_reg;
6042 alu.dst.chan = 2;
6043 alu.last = 1;
6044 alu.dst.write = 1;
6045 r = r600_bytecode_add_alu(ctx->bc, &alu);
6046 if (r)
6047 return r;
6048 }
6049
6050 src_loaded = TRUE;
6051 src_gpr = ctx->temp_reg;
6052 }
6053
6054 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
6055 int temp_h = 0, temp_v = 0;
6056 int start_val = 0;
6057
6058 /* if we've already loaded the src (i.e. CUBE don't reload it). */
6059 if (src_loaded == TRUE)
6060 start_val = 1;
6061 else
6062 src_loaded = TRUE;
6063 for (i = start_val; i < 3; i++) {
6064 int treg = r600_get_temp(ctx);
6065
6066 if (i == 0)
6067 src_gpr = treg;
6068 else if (i == 1)
6069 temp_h = treg;
6070 else
6071 temp_v = treg;
6072
6073 for (j = 0; j < 4; j++) {
6074 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6075 alu.op = ALU_OP1_MOV;
6076 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
6077 alu.dst.sel = treg;
6078 alu.dst.chan = j;
6079 if (j == 3)
6080 alu.last = 1;
6081 alu.dst.write = 1;
6082 r = r600_bytecode_add_alu(ctx->bc, &alu);
6083 if (r)
6084 return r;
6085 }
6086 }
6087 for (i = 1; i < 3; i++) {
6088 /* set gradients h/v */
6089 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6090 tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
6091 FETCH_OP_SET_GRADIENTS_V;
6092 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6093 tex.sampler_index_mode = sampler_index_mode;
6094 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6095 tex.resource_index_mode = sampler_index_mode;
6096
6097 tex.src_gpr = (i == 1) ? temp_h : temp_v;
6098 tex.src_sel_x = 0;
6099 tex.src_sel_y = 1;
6100 tex.src_sel_z = 2;
6101 tex.src_sel_w = 3;
6102
6103 tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
6104 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
6105 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
6106 tex.coord_type_x = 1;
6107 tex.coord_type_y = 1;
6108 tex.coord_type_z = 1;
6109 tex.coord_type_w = 1;
6110 }
6111 r = r600_bytecode_add_tex(ctx->bc, &tex);
6112 if (r)
6113 return r;
6114 }
6115 }
6116
6117 if (src_requires_loading && !src_loaded) {
6118 for (i = 0; i < 4; i++) {
6119 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6120 alu.op = ALU_OP1_MOV;
6121 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6122 alu.dst.sel = ctx->temp_reg;
6123 alu.dst.chan = i;
6124 if (i == 3)
6125 alu.last = 1;
6126 alu.dst.write = 1;
6127 r = r600_bytecode_add_alu(ctx->bc, &alu);
6128 if (r)
6129 return r;
6130 }
6131 src_loaded = TRUE;
6132 src_gpr = ctx->temp_reg;
6133 }
6134
6135 /* get offset values */
6136 if (inst->Texture.NumOffsets) {
6137 assert(inst->Texture.NumOffsets == 1);
6138
6139 /* The texture offset feature doesn't work with the TXF instruction
6140 * and must be emulated by adding the offset to the texture coordinates. */
6141 if (txf_add_offsets) {
6142 const struct tgsi_texture_offset *off = inst->TexOffsets;
6143
6144 switch (inst->Texture.Texture) {
6145 case TGSI_TEXTURE_3D:
6146 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6147 alu.op = ALU_OP2_ADD_INT;
6148 alu.src[0].sel = src_gpr;
6149 alu.src[0].chan = 2;
6150 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6151 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
6152 alu.dst.sel = src_gpr;
6153 alu.dst.chan = 2;
6154 alu.dst.write = 1;
6155 alu.last = 1;
6156 r = r600_bytecode_add_alu(ctx->bc, &alu);
6157 if (r)
6158 return r;
6159 /* fall through */
6160
6161 case TGSI_TEXTURE_2D:
6162 case TGSI_TEXTURE_SHADOW2D:
6163 case TGSI_TEXTURE_RECT:
6164 case TGSI_TEXTURE_SHADOWRECT:
6165 case TGSI_TEXTURE_2D_ARRAY:
6166 case TGSI_TEXTURE_SHADOW2D_ARRAY:
6167 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6168 alu.op = ALU_OP2_ADD_INT;
6169 alu.src[0].sel = src_gpr;
6170 alu.src[0].chan = 1;
6171 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6172 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
6173 alu.dst.sel = src_gpr;
6174 alu.dst.chan = 1;
6175 alu.dst.write = 1;
6176 alu.last = 1;
6177 r = r600_bytecode_add_alu(ctx->bc, &alu);
6178 if (r)
6179 return r;
6180 /* fall through */
6181
6182 case TGSI_TEXTURE_1D:
6183 case TGSI_TEXTURE_SHADOW1D:
6184 case TGSI_TEXTURE_1D_ARRAY:
6185 case TGSI_TEXTURE_SHADOW1D_ARRAY:
6186 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6187 alu.op = ALU_OP2_ADD_INT;
6188 alu.src[0].sel = src_gpr;
6189 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6190 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
6191 alu.dst.sel = src_gpr;
6192 alu.dst.write = 1;
6193 alu.last = 1;
6194 r = r600_bytecode_add_alu(ctx->bc, &alu);
6195 if (r)
6196 return r;
6197 break;
6198 /* texture offsets do not apply to other texture targets */
6199 }
6200 } else {
6201 switch (inst->Texture.Texture) {
6202 case TGSI_TEXTURE_3D:
6203 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
6204 /* fallthrough */
6205 case TGSI_TEXTURE_2D:
6206 case TGSI_TEXTURE_SHADOW2D:
6207 case TGSI_TEXTURE_RECT:
6208 case TGSI_TEXTURE_SHADOWRECT:
6209 case TGSI_TEXTURE_2D_ARRAY:
6210 case TGSI_TEXTURE_SHADOW2D_ARRAY:
6211 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
6212 /* fallthrough */
6213 case TGSI_TEXTURE_1D:
6214 case TGSI_TEXTURE_SHADOW1D:
6215 case TGSI_TEXTURE_1D_ARRAY:
6216 case TGSI_TEXTURE_SHADOW1D_ARRAY:
6217 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
6218 }
6219 }
6220 }
6221
6222 /* Obtain the sample index for reading a compressed MSAA color texture.
6223 * To read the FMASK, we use the ldfptr instruction, which tells us
6224 * where the samples are stored.
6225 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
6226 * which is the identity mapping. Each nibble says which physical sample
6227 * should be fetched to get that sample.
6228 *
6229 * Assume src.z contains the sample index. It should be modified like this:
6230 * src.z = (ldfptr() >> (src.z * 4)) & 0xF;
6231 * Then fetch the texel with src.
6232 */
6233 if (read_compressed_msaa) {
6234 unsigned sample_chan = 3;
6235 unsigned temp = r600_get_temp(ctx);
6236 assert(src_loaded);
6237
6238 /* temp.w = ldfptr() */
6239 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6240 tex.op = FETCH_OP_LD;
6241 tex.inst_mod = 1; /* to indicate this is ldfptr */
6242 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6243 tex.sampler_index_mode = sampler_index_mode;
6244 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6245 tex.resource_index_mode = sampler_index_mode;
6246 tex.src_gpr = src_gpr;
6247 tex.dst_gpr = temp;
6248 tex.dst_sel_x = 7; /* mask out these components */
6249 tex.dst_sel_y = 7;
6250 tex.dst_sel_z = 7;
6251 tex.dst_sel_w = 0; /* store X */
6252 tex.src_sel_x = 0;
6253 tex.src_sel_y = 1;
6254 tex.src_sel_z = 2;
6255 tex.src_sel_w = 3;
6256 tex.offset_x = offset_x;
6257 tex.offset_y = offset_y;
6258 tex.offset_z = offset_z;
6259 r = r600_bytecode_add_tex(ctx->bc, &tex);
6260 if (r)
6261 return r;
6262
6263 /* temp.x = sample_index*4 */
6264 if (ctx->bc->chip_class == CAYMAN) {
6265 for (i = 0 ; i < 4; i++) {
6266 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6267 alu.op = ALU_OP2_MULLO_INT;
6268 alu.src[0].sel = src_gpr;
6269 alu.src[0].chan = sample_chan;
6270 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6271 alu.src[1].value = 4;
6272 alu.dst.sel = temp;
6273 alu.dst.chan = i;
6274 alu.dst.write = i == 0;
6275 if (i == 3)
6276 alu.last = 1;
6277 r = r600_bytecode_add_alu(ctx->bc, &alu);
6278 if (r)
6279 return r;
6280 }
6281 } else {
6282 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6283 alu.op = ALU_OP2_MULLO_INT;
6284 alu.src[0].sel = src_gpr;
6285 alu.src[0].chan = sample_chan;
6286 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6287 alu.src[1].value = 4;
6288 alu.dst.sel = temp;
6289 alu.dst.chan = 0;
6290 alu.dst.write = 1;
6291 alu.last = 1;
6292 r = r600_bytecode_add_alu(ctx->bc, &alu);
6293 if (r)
6294 return r;
6295 }
6296
6297 /* sample_index = temp.w >> temp.x */
6298 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6299 alu.op = ALU_OP2_LSHR_INT;
6300 alu.src[0].sel = temp;
6301 alu.src[0].chan = 3;
6302 alu.src[1].sel = temp;
6303 alu.src[1].chan = 0;
6304 alu.dst.sel = src_gpr;
6305 alu.dst.chan = sample_chan;
6306 alu.dst.write = 1;
6307 alu.last = 1;
6308 r = r600_bytecode_add_alu(ctx->bc, &alu);
6309 if (r)
6310 return r;
6311
6312 /* sample_index & 0xF */
6313 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6314 alu.op = ALU_OP2_AND_INT;
6315 alu.src[0].sel = src_gpr;
6316 alu.src[0].chan = sample_chan;
6317 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6318 alu.src[1].value = 0xF;
6319 alu.dst.sel = src_gpr;
6320 alu.dst.chan = sample_chan;
6321 alu.dst.write = 1;
6322 alu.last = 1;
6323 r = r600_bytecode_add_alu(ctx->bc, &alu);
6324 if (r)
6325 return r;
6326 #if 0
6327 /* visualize the FMASK */
6328 for (i = 0; i < 4; i++) {
6329 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6330 alu.op = ALU_OP1_INT_TO_FLT;
6331 alu.src[0].sel = src_gpr;
6332 alu.src[0].chan = sample_chan;
6333 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6334 alu.dst.chan = i;
6335 alu.dst.write = 1;
6336 alu.last = 1;
6337 r = r600_bytecode_add_alu(ctx->bc, &alu);
6338 if (r)
6339 return r;
6340 }
6341 return 0;
6342 #endif
6343 }
6344
6345 /* does this shader want a num layers from TXQ for a cube array? */
6346 if (has_txq_cube_array_z) {
6347 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6348
6349 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6350 alu.op = ALU_OP1_MOV;
6351
6352 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
6353 if (ctx->bc->chip_class >= EVERGREEN) {
6354 /* channel 1 or 3 of each word */
6355 alu.src[0].sel += (id / 2);
6356 alu.src[0].chan = ((id % 2) * 2) + 1;
6357 } else {
6358 /* r600 we have them at channel 2 of the second dword */
6359 alu.src[0].sel += (id * 2) + 1;
6360 alu.src[0].chan = 2;
6361 }
6362 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6363 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
6364 alu.last = 1;
6365 r = r600_bytecode_add_alu(ctx->bc, &alu);
6366 if (r)
6367 return r;
6368 /* disable writemask from texture instruction */
6369 inst->Dst[0].Register.WriteMask &= ~4;
6370 }
6371
6372 opcode = ctx->inst_info->op;
6373 if (opcode == FETCH_OP_GATHER4 &&
6374 inst->TexOffsets[0].File != TGSI_FILE_NULL &&
6375 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
6376 opcode = FETCH_OP_GATHER4_O;
6377
6378 /* GATHER4_O/GATHER4_C_O use offset values loaded by
6379 SET_TEXTURE_OFFSETS instruction. The immediate offset values
6380 encoded in the instruction are ignored. */
6381 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6382 tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
6383 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6384 tex.sampler_index_mode = sampler_index_mode;
6385 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6386 tex.resource_index_mode = sampler_index_mode;
6387
6388 tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
6389 tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
6390 tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
6391 tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
6392 tex.src_sel_w = 4;
6393
6394 tex.dst_sel_x = 7;
6395 tex.dst_sel_y = 7;
6396 tex.dst_sel_z = 7;
6397 tex.dst_sel_w = 7;
6398
6399 r = r600_bytecode_add_tex(ctx->bc, &tex);
6400 if (r)
6401 return r;
6402 }
6403
6404 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
6405 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
6406 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
6407 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6408 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
6409 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
6410 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6411 switch (opcode) {
6412 case FETCH_OP_SAMPLE:
6413 opcode = FETCH_OP_SAMPLE_C;
6414 break;
6415 case FETCH_OP_SAMPLE_L:
6416 opcode = FETCH_OP_SAMPLE_C_L;
6417 break;
6418 case FETCH_OP_SAMPLE_LB:
6419 opcode = FETCH_OP_SAMPLE_C_LB;
6420 break;
6421 case FETCH_OP_SAMPLE_G:
6422 opcode = FETCH_OP_SAMPLE_C_G;
6423 break;
6424 /* Texture gather variants */
6425 case FETCH_OP_GATHER4:
6426 opcode = FETCH_OP_GATHER4_C;
6427 break;
6428 case FETCH_OP_GATHER4_O:
6429 opcode = FETCH_OP_GATHER4_C_O;
6430 break;
6431 }
6432 }
6433
6434 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6435 tex.op = opcode;
6436
6437 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6438 tex.sampler_index_mode = sampler_index_mode;
6439 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6440 tex.resource_index_mode = sampler_index_mode;
6441 tex.src_gpr = src_gpr;
6442 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6443
6444 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
6445 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
6446 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
6447 }
6448
6449 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
6450 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
6451 tex.inst_mod = texture_component_select;
6452
6453 if (ctx->bc->chip_class == CAYMAN) {
6454 /* GATHER4 result order is different from TGSI TG4 */
6455 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
6456 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
6457 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
6458 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6459 } else {
6460 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6461 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
6462 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6463 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6464 }
6465 }
6466 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
6467 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6468 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6469 tex.dst_sel_z = 7;
6470 tex.dst_sel_w = 7;
6471 }
6472 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
6473 tex.dst_sel_x = 3;
6474 tex.dst_sel_y = 7;
6475 tex.dst_sel_z = 7;
6476 tex.dst_sel_w = 7;
6477 }
6478 else {
6479 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6480 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6481 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
6482 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6483 }
6484
6485
6486 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ||
6487 inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
6488 tex.src_sel_x = 4;
6489 tex.src_sel_y = 4;
6490 tex.src_sel_z = 4;
6491 tex.src_sel_w = 4;
6492 } else if (src_loaded) {
6493 tex.src_sel_x = 0;
6494 tex.src_sel_y = 1;
6495 tex.src_sel_z = 2;
6496 tex.src_sel_w = 3;
6497 } else {
6498 tex.src_sel_x = ctx->src[0].swizzle[0];
6499 tex.src_sel_y = ctx->src[0].swizzle[1];
6500 tex.src_sel_z = ctx->src[0].swizzle[2];
6501 tex.src_sel_w = ctx->src[0].swizzle[3];
6502 tex.src_rel = ctx->src[0].rel;
6503 }
6504
6505 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
6506 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6507 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6508 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6509 tex.src_sel_x = 1;
6510 tex.src_sel_y = 0;
6511 tex.src_sel_z = 3;
6512 tex.src_sel_w = 2; /* route Z compare or Lod value into W */
6513 }
6514
6515 if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
6516 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
6517 tex.coord_type_x = 1;
6518 tex.coord_type_y = 1;
6519 }
6520 tex.coord_type_z = 1;
6521 tex.coord_type_w = 1;
6522
6523 tex.offset_x = offset_x;
6524 tex.offset_y = offset_y;
6525 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
6526 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
6527 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
6528 tex.offset_z = 0;
6529 }
6530 else {
6531 tex.offset_z = offset_z;
6532 }
6533
6534 /* Put the depth for comparison in W.
6535 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
6536 * Some instructions expect the depth in Z. */
6537 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
6538 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
6539 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
6540 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
6541 opcode != FETCH_OP_SAMPLE_C_L &&
6542 opcode != FETCH_OP_SAMPLE_C_LB) {
6543 tex.src_sel_w = tex.src_sel_z;
6544 }
6545
6546 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
6547 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
6548 if (opcode == FETCH_OP_SAMPLE_C_L ||
6549 opcode == FETCH_OP_SAMPLE_C_LB) {
6550 /* the array index is read from Y */
6551 tex.coord_type_y = 0;
6552 } else {
6553 /* the array index is read from Z */
6554 tex.coord_type_z = 0;
6555 tex.src_sel_z = tex.src_sel_y;
6556 }
6557 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
6558 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
6559 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6560 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
6561 (ctx->bc->chip_class >= EVERGREEN)))
6562 /* the array index is read from Z */
6563 tex.coord_type_z = 0;
6564
6565 /* mask unused source components */
6566 if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
6567 switch (inst->Texture.Texture) {
6568 case TGSI_TEXTURE_2D:
6569 case TGSI_TEXTURE_RECT:
6570 tex.src_sel_z = 7;
6571 tex.src_sel_w = 7;
6572 break;
6573 case TGSI_TEXTURE_1D_ARRAY:
6574 tex.src_sel_y = 7;
6575 tex.src_sel_w = 7;
6576 break;
6577 case TGSI_TEXTURE_1D:
6578 tex.src_sel_y = 7;
6579 tex.src_sel_z = 7;
6580 tex.src_sel_w = 7;
6581 break;
6582 }
6583 }
6584
6585 r = r600_bytecode_add_tex(ctx->bc, &tex);
6586 if (r)
6587 return r;
6588
6589 /* add shadow ambient support - gallium doesn't do it yet */
6590 return 0;
6591 }
6592
/* Emit TGSI LRP: dst = src0 * src1 + (1 - src0) * src2, computed per
 * enabled write-mask channel into the instruction's destination. */
static int tgsi_lrp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned i, temp_regs[2];
	int r;

	/* optimize if it's just an equal balance: with src0 == 0.5 the blend
	 * reduces to (src1 + src2) * 0.5, done as a single ADD using output
	 * modifier 3 (divide-by-two) per channel. */
	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
		for (i = 0; i < lasti + 1; i++) {
			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
			alu.omod = 3;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.chan = i;
			if (i == lasti) {
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* 1 - src0, staged per channel in temp_reg */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		r600_bytecode_src_toggle_neg(&alu.src[1]);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* (1 - src0) * src2, accumulated in place in temp_reg */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* src0 * src1 + (1 - src0) * src2; MULADD cannot encode an abs
	 * source modifier, so abs-modified sources are first copied to a
	 * scratch register by tgsi_make_src_for_op3(). */
	if (ctx->src[0].abs)
		temp_regs[0] = r600_get_temp(ctx);
	else
		temp_regs[0] = 0;
	if (ctx->src[1].abs)
		temp_regs[1] = r600_get_temp(ctx);
	else
		temp_regs[1] = 0;

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;
		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
		if (r)
			return r;
		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
		if (r)
			return r;
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6704
/* Emit TGSI CMP: dst = (src0 < 0.0) ? src1 : src2.
 * Implemented with CNDGE (select first operand when condition >= 0) by
 * feeding TGSI's src2 and src1 in swapped order. */
static int tgsi_cmp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r, j;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int temp_regs[3];

	/* op3 instructions cannot encode an abs source modifier; stage any
	 * abs-modified source in a scratch register first. */
	for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
		temp_regs[j] = 0;
		if (ctx->src[j].abs)
			temp_regs[j] = r600_get_temp(ctx);
	}

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_CNDGE;
		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
		if (r)
			return r;
		/* note the swap: src[1] <- TGSI src2, src[2] <- TGSI src1 */
		r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
		if (r)
			return r;
		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
		if (r)
			return r;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6746
6747 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
6748 {
6749 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6750 struct r600_bytecode_alu alu;
6751 int i, r;
6752 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6753
6754 for (i = 0; i < lasti + 1; i++) {
6755 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6756 continue;
6757
6758 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6759 alu.op = ALU_OP3_CNDE_INT;
6760 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6761 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6762 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
6763 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6764 alu.dst.chan = i;
6765 alu.dst.write = 1;
6766 alu.is_op3 = 1;
6767 if (i == lasti)
6768 alu.last = 1;
6769 r = r600_bytecode_add_alu(ctx->bc, &alu);
6770 if (r)
6771 return r;
6772 }
6773 return 0;
6774 }
6775
/* Emit TGSI XPD (cross product) in two passes:
 *   pass 1: tmp.xyz = src0.zxy * src1.yzx
 *   pass 2: dst.xyz = src0.yzx * src1.zxy - tmp.xyz
 * e.g. dst.x = src0.y*src1.z - src0.z*src1.y.  The w lane is computed with
 * both sources forced to SRC_0, yielding 0.
 * NOTE(review): TGSI documents XPD.w as 1.0; this emits 0 instead —
 * confirm whether any consumer reads the w lane.
 * If the write mask is partial, the result is staged in temp_reg and
 * copied out at the end. */
static int tgsi_xpd(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	static const unsigned int src0_swizzle[] = {2, 0, 1};
	static const unsigned int src1_swizzle[] = {1, 2, 0};
	struct r600_bytecode_alu alu;
	uint32_t use_temp = 0;
	int i, r;

	if (inst->Dst[0].Register.WriteMask != 0xf)
		use_temp = 1;

	/* pass 1: tmp = src0.zxy * src1.yzx */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		if (i < 3) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
		} else {
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* pass 2: dst = src0.yzx * src1.zxy - tmp (MULADD with negated tmp) */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;

		if (i < 3) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
		} else {
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].neg = 1;
		alu.src[2].chan = i;

		if (use_temp)
			alu.dst.sel = ctx->temp_reg;
		else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	if (use_temp)
		return tgsi_helper_copy(ctx, inst);
	return 0;
}
6847
/* Emit TGSI EXP into temp_reg, then copy to the destination:
 *   result.x = 2^floor(src.x)
 *   result.y = src.x - floor(src.x)   (fractional part)
 *   result.z = 2^src.x                (rough approximation)
 *   result.w = 1.0
 * Each component is computed only if its write-mask bit is set.  On Cayman
 * the transcendental EXP_IEEE runs in the vector lanes and is replicated
 * across three slots (see the CAYMAN notes at the top of the file). */
static int tgsi_exp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i;

	/* result.x = 2^floor(src); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* NOTE(review): alu is reused below without a fresh memset, so
		 * any source modifiers r600_bytecode_src() picked up from
		 * src[0] above carry over to the EXP_IEEE source — confirm
		 * this is intended. */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = i == 0;
				alu.last = i == 2;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.y = tmp - floor(tmp); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FRACT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
#if 0
		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (r)
			return r;
#endif
		alu.dst.write = 1;
		alu.dst.chan = 1;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = RoughApprox2ToX(tmp);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				}

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;

			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0;*/
	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	/* copy the staged temp vector to the real destination */
	return tgsi_helper_copy(ctx, inst);
}
6976
/* Emit TGSI LOG into temp_reg, then copy to the destination:
 *   result.x = floor(log2(|src.x|))
 *   result.y = |src.x| / 2^floor(log2(|src.x|))
 *   result.z = log2(|src.x|)
 *   result.w = 1.0
 * Each component is computed only if its write-mask bit is set.  On Cayman
 * the transcendental LOG/EXP/RECIP ops run in the vector lanes and are
 * replicated across three slots (see the CAYMAN notes at the top of the
 * file). */
static int tgsi_log(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i;

	/* result.x = floor(log2(|src|)); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 0)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* NOTE(review): alu is not cleared before this FLOOR, so the
		 * abs modifier set on src[0] above is still in effect here,
		 * i.e. this computes floor(|log2(|src|)|) — confirm this is
		 * intended for inputs with |src| < 1 where log2 is negative. */
		alu.op = ALU_OP1_FLOOR;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {

		/* temp.y = log2(|src|) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.y = floor(temp.y) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* temp.y = 2 ^ temp.y */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 1;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.y = 1 / temp.y */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 1;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.y = |src| * temp.y */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP2_MUL;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		r600_bytecode_src_set_abs(&alu.src[0]);

		alu.src[1].sel = ctx->temp_reg;
		alu.src[1].chan = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = log2(|src|);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				if (i == 2)
					alu.dst.write = 1;
				alu.dst.chan = i;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0; */
	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* copy the staged temp vector to the real destination */
	return tgsi_helper_copy(ctx, inst);
}
7236
/* Load an address/index register on Evergreen+ from src0.
 * ARL converts with FLT_TO_INT_FLOOR, ARR with FLT_TO_INT, and UARL moves
 * the already-integer value unchanged.  The result lands in the register
 * returned by get_address_file_reg(), and the corresponding "loaded" flag
 * is cleared so the next indirect access re-loads it. */
static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);

	assert(inst->Dst[0].Register.Index < 3);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
		break;
	case TGSI_OPCODE_ARR:
		alu.op = ALU_OP1_FLT_TO_INT;
		break;
	case TGSI_OPCODE_UARL:
		alu.op = ALU_OP1_MOV;
		break;
	default:
		assert(0);
		return -1;
	}

	/* alu is reused across iterations; only src/dst/last change. */
	for (i = 0; i <= lasti; ++i) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.last = i == lasti;
		alu.dst.sel = reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* index 0 is AR; indices 1..2 are the loop/index registers */
	if (inst->Dst[0].Register.Index > 0)
		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
	else
		ctx->bc->ar_loaded = 0;

	return 0;
}
/* Load the address register (AR) on pre-Evergreen (r600/r700) parts.
 * These chips lack FLT_TO_INT_FLOOR, so ARL is emulated as FLOOR followed
 * by FLT_TO_INT; ARR converts directly; UARL just moves the integer value.
 * The converted value is staged in ar_reg and ar_loaded is cleared so the
 * next indirect access re-loads AR from it. */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* step 1: floor the float source into ar_reg */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		/* step 2: convert the floored value to integer in place */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* force the next indirect access to re-load AR from ar_reg */
	ctx->bc->ar_loaded = 0;
	return 0;
}
7359
7360 static int tgsi_opdst(struct r600_shader_ctx *ctx)
7361 {
7362 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7363 struct r600_bytecode_alu alu;
7364 int i, r = 0;
7365
7366 for (i = 0; i < 4; i++) {
7367 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7368
7369 alu.op = ALU_OP2_MUL;
7370 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7371
7372 if (i == 0 || i == 3) {
7373 alu.src[0].sel = V_SQ_ALU_SRC_1;
7374 } else {
7375 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7376 }
7377
7378 if (i == 0 || i == 2) {
7379 alu.src[1].sel = V_SQ_ALU_SRC_1;
7380 } else {
7381 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
7382 }
7383 if (i == 3)
7384 alu.last = 1;
7385 r = r600_bytecode_add_alu(ctx->bc, &alu);
7386 if (r)
7387 return r;
7388 }
7389 return 0;
7390 }
7391
7392 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
7393 {
7394 struct r600_bytecode_alu alu;
7395 int r;
7396
7397 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7398 alu.op = opcode;
7399 alu.execute_mask = 1;
7400 alu.update_pred = 1;
7401
7402 alu.dst.sel = ctx->temp_reg;
7403 alu.dst.write = 1;
7404 alu.dst.chan = 0;
7405
7406 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7407 alu.src[1].sel = V_SQ_ALU_SRC_0;
7408 alu.src[1].chan = 0;
7409
7410 alu.last = 1;
7411
7412 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
7413 if (r)
7414 return r;
7415 return 0;
7416 }
7417
/* Emit 'pops' stack POPs.  When the preceding CF instruction is an ALU
 * clause, up to two pops are folded into it by retagging it as
 * ALU_POP_AFTER / ALU_POP2_AFTER; otherwise (or when force_add_cf is set)
 * an explicit CF_OP_POP instruction is emitted. */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop counts pops the last CF clause already performs:
		 * 0 for plain ALU, 1 for ALU_POP_AFTER, and 3 as a sentinel
		 * meaning "cannot fold into the last clause". */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			/* more than two total pops: need an explicit POP */
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
7450
/* Recompute the worst-case hardware call-stack depth (stack.max_entries)
 * after a push of kind 'reason', accounting for per-generation quirks in
 * how many stack elements each kind of push consumes. */
static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
		unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements, entries;

	unsigned entry_size = stack->entry_size;

	/* loop and WQM pushes consume a full entry each; plain pushes only
	 * one element */
	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
}
7514
7515 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
7516 {
7517 switch(reason) {
7518 case FC_PUSH_VPM:
7519 --ctx->bc->stack.push;
7520 assert(ctx->bc->stack.push >= 0);
7521 break;
7522 case FC_PUSH_WQM:
7523 --ctx->bc->stack.push_wqm;
7524 assert(ctx->bc->stack.push_wqm >= 0);
7525 break;
7526 case FC_LOOP:
7527 --ctx->bc->stack.loop;
7528 assert(ctx->bc->stack.loop >= 0);
7529 break;
7530 default:
7531 assert(0);
7532 break;
7533 }
7534 }
7535
/* Account for one entry pushed onto the hardware stack and update the
 * shader's worst-case stack depth. */
static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
{
	switch (reason) {
	case FC_PUSH_VPM:
		++ctx->bc->stack.push;
		break;
	case FC_PUSH_WQM:
		++ctx->bc->stack.push_wqm;
		/* NOTE(review): no break here, so a WQM push also increments
		 * stack.loop, while callstack_pop() only decrements push_wqm.
		 * This looks like either a missing break or an intentionally
		 * conservative over-count for stack sizing — confirm intent
		 * before changing. */
	case FC_LOOP:
		++ctx->bc->stack.loop;
		break;
	default:
		assert(0);
	}

	callstack_update_max_depth(ctx, reason);
}
7553
7554 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
7555 {
7556 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
7557
7558 sp->mid = realloc((void *)sp->mid,
7559 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
7560 sp->mid[sp->num_mid] = ctx->bc->cf_last;
7561 sp->num_mid++;
7562 }
7563
7564 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
7565 {
7566 ctx->bc->fc_sp++;
7567 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
7568 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
7569 }
7570
7571 static void fc_poplevel(struct r600_shader_ctx *ctx)
7572 {
7573 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
7574 free(sp->mid);
7575 sp->mid = NULL;
7576 sp->num_mid = 0;
7577 sp->start = NULL;
7578 sp->type = 0;
7579 ctx->bc->fc_sp--;
7580 }
7581
/* NOTE(review): dead code — unfinished subroutine/structured-return
 * helpers (CAL/RET support was never completed; see the XXX below).
 * If ever enabled, note that emit_return() and emit_jump_to_offset()
 * each contain a stray ')' after the r600_bytecode_add_cfinst() call
 * and would not compile as-is. */
#if 0
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
7629
7630 static int emit_if(struct r600_shader_ctx *ctx, int opcode)
7631 {
7632 int alu_type = CF_OP_ALU_PUSH_BEFORE;
7633
7634 /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
7635 * LOOP_STARTxxx for nested loops may put the branch stack into a state
7636 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
7637 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
7638 if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
7639 r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
7640 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
7641 alu_type = CF_OP_ALU;
7642 }
7643
7644 emit_logic_pred(ctx, opcode, alu_type);
7645
7646 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
7647
7648 fc_pushlevel(ctx, FC_IF);
7649
7650 callstack_push(ctx, FC_PUSH_VPM);
7651 return 0;
7652 }
7653
7654 static int tgsi_if(struct r600_shader_ctx *ctx)
7655 {
7656 return emit_if(ctx, ALU_OP2_PRED_SETNE);
7657 }
7658
7659 static int tgsi_uif(struct r600_shader_ctx *ctx)
7660 {
7661 return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
7662 }
7663
7664 static int tgsi_else(struct r600_shader_ctx *ctx)
7665 {
7666 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
7667 ctx->bc->cf_last->pop_count = 1;
7668
7669 fc_set_mid(ctx, ctx->bc->fc_sp);
7670 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
7671 return 0;
7672 }
7673
7674 static int tgsi_endif(struct r600_shader_ctx *ctx)
7675 {
7676 pops(ctx, 1);
7677 if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
7678 R600_ERR("if/endif unbalanced in shader\n");
7679 return -1;
7680 }
7681
7682 if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
7683 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
7684 ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
7685 } else {
7686 ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
7687 }
7688 fc_poplevel(ctx);
7689
7690 callstack_pop(ctx, FC_PUSH_VPM);
7691 return 0;
7692 }
7693
7694 static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
7695 {
7696 /* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
7697 * limited to 4096 iterations, like the other LOOP_* instructions. */
7698 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
7699
7700 fc_pushlevel(ctx, FC_LOOP);
7701
7702 /* check stack depth */
7703 callstack_push(ctx, FC_LOOP);
7704 return 0;
7705 }
7706
7707 static int tgsi_endloop(struct r600_shader_ctx *ctx)
7708 {
7709 int i;
7710
7711 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
7712
7713 if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
7714 R600_ERR("loop/endloop in shader code are not paired.\n");
7715 return -EINVAL;
7716 }
7717
7718 /* fixup loop pointers - from r600isa
7719 LOOP END points to CF after LOOP START,
7720 LOOP START point to CF after LOOP END
7721 BRK/CONT point to LOOP END CF
7722 */
7723 ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
7724
7725 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
7726
7727 for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
7728 ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
7729 }
7730 /* XXX add LOOPRET support */
7731 fc_poplevel(ctx);
7732 callstack_pop(ctx, FC_LOOP);
7733 return 0;
7734 }
7735
7736 static int tgsi_loop_breakc(struct r600_shader_ctx *ctx)
7737 {
7738 int r;
7739 unsigned int fscp;
7740
7741 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
7742 {
7743 if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
7744 break;
7745 }
7746 if (fscp == 0) {
7747 R600_ERR("BREAKC not inside loop/endloop pair\n");
7748 return -EINVAL;
7749 }
7750
7751 if (ctx->bc->chip_class == EVERGREEN &&
7752 ctx->bc->family != CHIP_CYPRESS &&
7753 ctx->bc->family != CHIP_JUNIPER) {
7754 /* HW bug: ALU_BREAK does not save the active mask correctly */
7755 r = tgsi_uif(ctx);
7756 if (r)
7757 return r;
7758
7759 r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_BREAK);
7760 if (r)
7761 return r;
7762 fc_set_mid(ctx, fscp);
7763
7764 return tgsi_endif(ctx);
7765 } else {
7766 r = emit_logic_pred(ctx, ALU_OP2_PRED_SETE_INT, CF_OP_ALU_BREAK);
7767 if (r)
7768 return r;
7769 fc_set_mid(ctx, fscp);
7770 }
7771
7772 return 0;
7773 }
7774
7775 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
7776 {
7777 unsigned int fscp;
7778
7779 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
7780 {
7781 if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
7782 break;
7783 }
7784
7785 if (fscp == 0) {
7786 R600_ERR("Break not inside loop/endloop pair\n");
7787 return -EINVAL;
7788 }
7789
7790 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
7791
7792 fc_set_mid(ctx, fscp);
7793
7794 return 0;
7795 }
7796
7797 static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
7798 {
7799 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7800 int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
7801 int r;
7802
7803 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
7804 emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
7805
7806 r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
7807 if (!r) {
7808 ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
7809 return emit_inc_ring_offset(ctx, stream, TRUE);
7810 }
7811 return r;
7812 }
7813
7814 static int tgsi_umad(struct r600_shader_ctx *ctx)
7815 {
7816 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7817 struct r600_bytecode_alu alu;
7818 int i, j, k, r;
7819 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7820
7821 /* src0 * src1 */
7822 for (i = 0; i < lasti + 1; i++) {
7823 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7824 continue;
7825
7826 if (ctx->bc->chip_class == CAYMAN) {
7827 for (j = 0 ; j < 4; j++) {
7828 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7829
7830 alu.op = ALU_OP2_MULLO_UINT;
7831 for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
7832 r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
7833 }
7834 alu.dst.chan = j;
7835 alu.dst.sel = ctx->temp_reg;
7836 alu.dst.write = (j == i);
7837 if (j == 3)
7838 alu.last = 1;
7839 r = r600_bytecode_add_alu(ctx->bc, &alu);
7840 if (r)
7841 return r;
7842 }
7843 } else {
7844 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7845
7846 alu.dst.chan = i;
7847 alu.dst.sel = ctx->temp_reg;
7848 alu.dst.write = 1;
7849
7850 alu.op = ALU_OP2_MULLO_UINT;
7851 for (j = 0; j < 2; j++) {
7852 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
7853 }
7854
7855 alu.last = 1;
7856 r = r600_bytecode_add_alu(ctx->bc, &alu);
7857 if (r)
7858 return r;
7859 }
7860 }
7861
7862
7863 for (i = 0; i < lasti + 1; i++) {
7864 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7865 continue;
7866
7867 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7868 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7869
7870 alu.op = ALU_OP2_ADD_INT;
7871
7872 alu.src[0].sel = ctx->temp_reg;
7873 alu.src[0].chan = i;
7874
7875 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
7876 if (i == lasti) {
7877 alu.last = 1;
7878 }
7879 r = r600_bytecode_add_alu(ctx->bc, &alu);
7880 if (r)
7881 return r;
7882 }
7883 return 0;
7884 }
7885
/* Dispatch table for the base R600 generation: maps each TGSI opcode to
 * a hardware ALU/fetch/CF opcode plus the callback that lowers it to
 * bytecode.  Indexed by TGSI_OPCODE_*; bare numeric indices are gaps in
 * the TGSI opcode space.  tgsi_unsupported marks opcodes this
 * generation does not implement. */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	/* XXX:
	 * For state trackers other than OpenGL, we'll want to use
	 * _RECIP_IEEE instead.
	 */
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_loop_breakc},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
8090
/* Dispatch table for the Evergreen generation (eg_ prefix; see the
 * EVERGREEN chip-class checks elsewhere in this file).  Differs from
 * the R600 table in e.g. IEEE RCP/RSQ, native TG4/LODQ/BFE support,
 * doubles (F2D and friends), and interpolation opcodes; BREAKC is not
 * wired up here.  Indexed by TGSI_OPCODE_*. */
static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
8312
8313 static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
8314 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
8315 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
8316 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
8317 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
8318 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
8319 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
8320 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
8321 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL, tgsi_op2},
8322 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
8323 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4, tgsi_dp},
8324 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4, tgsi_dp},
8325 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
8326 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN, tgsi_op2},
8327 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX, tgsi_op2},
8328 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
8329 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
8330 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD, tgsi_op3},
8331 [TGSI_OPCODE_SUB] = { ALU_OP2_ADD, tgsi_op2},
8332 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
8333 [TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported},
8334 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
8335 [TGSI_OPCODE_DP2A] = { ALU_OP0_NOP, tgsi_unsupported},
8336 [22] = { ALU_OP0_NOP, tgsi_unsupported},
8337 [23] = { ALU_OP0_NOP, tgsi_unsupported},
8338 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
8339 [TGSI_OPCODE_CLAMP] = { ALU_OP0_NOP, tgsi_unsupported},
8340 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
8341 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
8342 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
8343 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
8344 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow},
8345 [TGSI_OPCODE_XPD] = { ALU_OP0_NOP, tgsi_xpd},
8346 [32] = { ALU_OP0_NOP, tgsi_unsupported},
8347 [TGSI_OPCODE_ABS] = { ALU_OP1_MOV, tgsi_op2},
8348 [34] = { ALU_OP0_NOP, tgsi_unsupported},
8349 [TGSI_OPCODE_DPH] = { ALU_OP2_DOT4, tgsi_dp},
8350 [TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},
8351 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
8352 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
8353 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
8354 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported},
8355 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
8356 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
8357 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
8358 [44] = { ALU_OP0_NOP, tgsi_unsupported},
8359 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
8360 [46] = { ALU_OP0_NOP, tgsi_unsupported},
8361 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
8362 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig},
8363 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
8364 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
8365 [51] = { ALU_OP0_NOP, tgsi_unsupported},
8366 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
8367 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
8368 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
8369 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported},
8370 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
8371 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
8372 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
8373 [59] = { ALU_OP0_NOP, tgsi_unsupported},
8374 [60] = { ALU_OP0_NOP, tgsi_unsupported},
8375 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
8376 [62] = { ALU_OP0_NOP, tgsi_unsupported},
8377 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
8378 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
8379 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
8380 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
8381 [TGSI_OPCODE_SCS] = { ALU_OP0_NOP, tgsi_scs},
8382 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
8383 [69] = { ALU_OP0_NOP, tgsi_unsupported},
8384 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
8385 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4, tgsi_dp},
8386 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
8387 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
8388 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
8389 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
8390 [76] = { ALU_OP0_NOP, tgsi_unsupported},
8391 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
8392 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
8393 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
8394 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
8395 [TGSI_OPCODE_PUSHA] = { ALU_OP0_NOP, tgsi_unsupported},
8396 [TGSI_OPCODE_POPA] = { ALU_OP0_NOP, tgsi_unsupported},
8397 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
8398 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2},
8399 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
8400 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
8401 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
8402 [88] = { ALU_OP0_NOP, tgsi_unsupported},
8403 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
8404 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
8405 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
8406 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
8407 [TGSI_OPCODE_SAD] = { ALU_OP0_NOP, tgsi_unsupported},
8408 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
8409 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
8410 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
8411 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
8412 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
8413 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
8414 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
8415 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
8416 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
8417 [TGSI_OPCODE_TXQ_LZ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
8418 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
8419 [105] = { ALU_OP0_NOP, tgsi_unsupported},
8420 [106] = { ALU_OP0_NOP, tgsi_unsupported},
8421 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
8422 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
8423 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
8424 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
8425 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
8426 [112] = { ALU_OP0_NOP, tgsi_unsupported},
8427 [TGSI_OPCODE_CALLNZ] = { ALU_OP0_NOP, tgsi_unsupported},
8428 [114] = { ALU_OP0_NOP, tgsi_unsupported},
8429 [TGSI_OPCODE_BREAKC] = { ALU_OP0_NOP, tgsi_unsupported},
8430 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
8431 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
8432 [118] = { ALU_OP0_NOP, tgsi_unsupported},
8433 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2},
8434 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
8435 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
8436 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
8437 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
8438 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
8439 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
8440 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
8441 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2},
8442 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2},
8443 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
8444 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
8445 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
8446 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
8447 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
8448 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
8449 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
8450 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
8451 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
8452 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
8453 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
8454 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
8455 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
8456 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
8457 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
8458 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
8459 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
8460 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
8461 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
8462 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
8463 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
8464 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
8465 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
8466 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
8467 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
8468 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
8469 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
8470 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
8471 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
8472 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
8473 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
8474 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
8475 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
8476 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
8477 [TGSI_OPCODE_MFENCE] = { ALU_OP0_NOP, tgsi_unsupported},
8478 [TGSI_OPCODE_LFENCE] = { ALU_OP0_NOP, tgsi_unsupported},
8479 [TGSI_OPCODE_SFENCE] = { ALU_OP0_NOP, tgsi_unsupported},
8480 [TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported},
8481 [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
8482 [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
8483 [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
8484 [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
8485 [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
8486 [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
8487 [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
8488 [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
8489 [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
8490 [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
8491 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
8492 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
8493 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
8494 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
8495 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
8496 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
8497 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
8498 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_op3},
8499 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_op3},
8500 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
8501 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
8502 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
8503 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
8504 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
8505 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
8506 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
8507 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
8508 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
8509 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
8510 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
8511 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
8512 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
8513 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
8514 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
8515 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
8516 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
8517 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
8518 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
8519 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
8520 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
8521 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
8522 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
8523 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
8524 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
8525 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
8526 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
8527 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
8528 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
8529 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
8530 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
8531 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
8532 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
8533 };