r600: update correct hw shaders depending on configuration.
[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include "r600_sq.h"
24 #include "r600_llvm.h"
25 #include "r600_formats.h"
26 #include "r600_opcodes.h"
27 #include "r600_shader.h"
28 #include "r600d.h"
29
30 #include "sb/sb_public.h"
31
32 #include "pipe/p_shader_tokens.h"
33 #include "tgsi/tgsi_info.h"
34 #include "tgsi/tgsi_parse.h"
35 #include "tgsi/tgsi_scan.h"
36 #include "tgsi/tgsi_dump.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include <stdio.h>
40 #include <errno.h>
41
42 /* CAYMAN notes
43 Why CAYMAN got loops for lots of instructions is explained here.
44
45 -These 8xx t-slot only ops are implemented in all vector slots.
46 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47 These 8xx t-slot only opcodes become vector ops, with all four
48 slots expecting the arguments on sources a and b. Result is
49 broadcast to all channels.
50 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51 These 8xx t-slot only opcodes become vector ops in the z, y, and
52 x slots.
53 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55 SQRT_IEEE/_64
56 SIN/COS
57 The w slot may have an independent co-issued operation, or if the
58 result is required to be in the w slot, the opcode above may be
59 issued in the w slot as well.
60 The compiler must issue the source argument to slots z, y, and x
61 */
62
63 #define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
64 static int r600_shader_from_tgsi(struct r600_context *rctx,
65 struct r600_pipe_shader *pipeshader,
66 union r600_shader_key key);
67
68
69 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
70 int size, unsigned comp_mask) {
71
72 if (!size)
73 return;
74
75 if (ps->num_arrays == ps->max_arrays) {
76 ps->max_arrays += 64;
77 ps->arrays = realloc(ps->arrays, ps->max_arrays *
78 sizeof(struct r600_shader_array));
79 }
80
81 int n = ps->num_arrays;
82 ++ps->num_arrays;
83
84 ps->arrays[n].comp_mask = comp_mask;
85 ps->arrays[n].gpr_start = start_gpr;
86 ps->arrays[n].gpr_count = size;
87 }
88
89 static void r600_dump_streamout(struct pipe_stream_output_info *so)
90 {
91 unsigned i;
92
93 fprintf(stderr, "STREAMOUT\n");
94 for (i = 0; i < so->num_outputs; i++) {
95 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
96 so->output[i].start_component;
97 fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
98 i,
99 so->output[i].stream,
100 so->output[i].output_buffer,
101 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
102 so->output[i].register_index,
103 mask & 1 ? "x" : "",
104 mask & 2 ? "y" : "",
105 mask & 4 ? "z" : "",
106 mask & 8 ? "w" : "",
107 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
108 }
109 }
110
111 static int store_shader(struct pipe_context *ctx,
112 struct r600_pipe_shader *shader)
113 {
114 struct r600_context *rctx = (struct r600_context *)ctx;
115 uint32_t *ptr, i;
116
117 if (shader->bo == NULL) {
118 shader->bo = (struct r600_resource*)
119 pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
120 if (shader->bo == NULL) {
121 return -ENOMEM;
122 }
123 ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
124 if (R600_BIG_ENDIAN) {
125 for (i = 0; i < shader->shader.bc.ndw; ++i) {
126 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
127 }
128 } else {
129 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
130 }
131 rctx->b.ws->buffer_unmap(shader->bo->cs_buf);
132 }
133
134 return 0;
135 }
136
/* Compile a shader variant: translate TGSI into r600 bytecode, optionally
 * run the optimizing backend (SB), upload the result to a GPU buffer and
 * build the stage-specific hardware state.
 *
 * ctx    - pipe context (really a struct r600_context)
 * shader - shader variant to build; shader->selector supplies TGSI tokens
 * key    - variant key (as_es/as_ls flags etc.) selecting the HW stage
 *
 * Returns 0 on success or a negative errno; on any failure the partially
 * built shader is destroyed before returning.
 */
int r600_pipe_shader_create(struct pipe_context *ctx,
			struct r600_pipe_shader *shader,
			union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
	/* SB is the optimizing backend; may be disabled via debug flags */
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	/* Check if the bytecode has already been built. When using the llvm
	 * backend, r600_shader_from_tgsi() will take care of building the
	 * bytecode.
	 */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	if (dump && !sb_disasm) {
		/* plain (non-SB) disassembly of the raw bytecode */
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		/* run SB for optimization and/or its disassembler */
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
					     dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state: update the correct hw shader depending on the TGSI
	 * processor type, the variant key and the chip class (evergreen+
	 * vs r600/r700). */
	switch (shader->shader.processor_type) {
	case TGSI_PROCESSOR_TESS_CTRL:
		evergreen_update_hs_state(ctx, shader);
		break;
	case TGSI_PROCESSOR_TESS_EVAL:
		if (key.tes.as_es)
			evergreen_update_es_state(ctx, shader);
		else
			evergreen_update_vs_state(ctx, shader);
		break;
	case TGSI_PROCESSOR_GEOMETRY:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case TGSI_PROCESSOR_VERTEX:
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (key.vs.as_ls)
				evergreen_update_ls_state(ctx, shader);
			else if (key.vs.as_es)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case TGSI_PROCESSOR_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
263
/* Release the resources owned by a compiled shader variant: the GPU
 * bytecode buffer, the CPU-side bytecode and the command buffer. */
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
{
	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
270
271 /*
272 * tgsi -> r600 shader
273 */
274 struct r600_shader_tgsi_instruction;
275
/* Source operand description collected from a TGSI source register. */
struct r600_shader_src {
	unsigned				sel;        /* register selector */
	unsigned				swizzle[4]; /* per-channel swizzle */
	unsigned				neg;        /* negate modifier */
	unsigned				abs;        /* absolute-value modifier */
	unsigned				rel;        /* relative (indirect) addressing */
	unsigned				kc_bank;    /* constant-cache bank */
	boolean					kc_rel; /* true if cache bank is indexed */
	uint32_t				value[4];   /* literal values when sel is a literal */
};
286
/* Per-interpolator allocation state used on evergreen. */
struct eg_interp {
	boolean					enabled;  /* interpolator is used by the shader */
	unsigned				ij_index; /* allocated barycentric (i,j) pair index */
};
291
/* State carried through the TGSI -> r600 bytecode translation of one
 * shader. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;
	struct tgsi_parse_context		parse;
	const struct tgsi_token			*tokens;
	unsigned				type;     /* TGSI_PROCESSOR_* of this shader */
	unsigned				file_offset[TGSI_FILE_COUNT]; /* GPR base per register file */
	unsigned				temp_reg; /* first driver-temporary GPR */
	const struct r600_shader_tgsi_instruction	*inst_info;
	struct r600_bytecode			*bc;
	struct r600_shader			*shader;
	struct r600_shader_src			src[4];   /* operands of the current instruction */
	uint32_t				*literals; /* immediate pool gathered from TGSI */
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used; /* temps used by current instruction */
	boolean					use_llvm;
	/* needed for evergreen interpolation */
	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used;
	boolean					clip_vertex_write;
	unsigned				cv_output;        /* output slot of CLIPVERTEX */
	unsigned				edgeflag_output;  /* output slot of EDGEFLAG */
	int					fragcoord_input;  /* input slot of POSITION in FS, or unset */
	int					native_integers;
	int					next_ring_offset;   /* GS input ring allocation cursor */
	int					gs_out_ring_offset; /* GS output ring allocation cursor */
	int					gs_next_vertex;
	struct r600_shader			*gs_for_vs;
	int					gs_export_gpr_tregs[4];
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
};
327
/* One entry of the per-chip TGSI opcode dispatch tables: maps a TGSI
 * opcode to its translation callback. */
struct r600_shader_tgsi_instruction {
	unsigned	op; /* NOTE(review): opcode value consumed by process(); presumably the hw op — confirm against the tables */
	int (*process)(struct r600_shader_ctx *ctx);
};
332
333 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
334 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
335 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
336 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
337 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
338 static int tgsi_else(struct r600_shader_ctx *ctx);
339 static int tgsi_endif(struct r600_shader_ctx *ctx);
340 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
341 static int tgsi_endloop(struct r600_shader_ctx *ctx);
342 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
343 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
344 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
345 unsigned int dst_reg);
346 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
347 const struct r600_shader_src *shader_src,
348 unsigned chan);
349
/* Check the current TGSI instruction against the translator's limits.
 * Returns 0 when supported, -EINVAL otherwise. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	int j;

	/* DFRACEXP is the only opcode allowed two destinations */
	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
	if (i->Instruction.Predicate) {
		R600_ERR("predicate unsupported\n");
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
			/* 2D source indexing is only allowed for constants
			 * and, in geometry shaders, for inputs */
			switch (i->Src[j].Register.File) {
			case TGSI_FILE_CONSTANT:
				break;
			case TGSI_FILE_INPUT:
				if (ctx->type == TGSI_PROCESSOR_GEOMETRY)
					break;
				/* fallthrough - dimensioned inputs outside GS are unsupported */
			default:
				R600_ERR("unsupported src %d (dimension %d)\n", j,
					 i->Src[j].Register.Dimension);
				return -EINVAL;
			}
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}
392
393 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
394 {
395 if (interpolate == TGSI_INTERPOLATE_COLOR ||
396 interpolate == TGSI_INTERPOLATE_LINEAR ||
397 interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
398 {
399 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
400 int loc;
401
402 switch(location) {
403 case TGSI_INTERPOLATE_LOC_CENTER:
404 loc = 1;
405 break;
406 case TGSI_INTERPOLATE_LOC_CENTROID:
407 loc = 2;
408 break;
409 case TGSI_INTERPOLATE_LOC_SAMPLE:
410 default:
411 loc = 0; break;
412 }
413
414 return is_linear * 3 + loc;
415 }
416
417 return -1;
418 }
419
/* Assign the input its barycentric (i,j) pair index, looked up from the
 * per-interpolator allocation recorded in ctx->eg_interpolators. */
static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
					     int input)
{
	int i = eg_get_interpolator_index(
		ctx->shader->input[input].interpolate,
		ctx->shader->input[input].interpolate_location);
	assert(i >= 0);
	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
}
429
/* Emit the interpolation ALU ops (INTERP_ZW then INTERP_XY, four each)
 * for a smoothly interpolated FS input. The (i,j) barycentrics live in
 * the GPR/channel pair derived from the input's ij_index; the parameter
 * itself comes from the LDS parameter slot via src[1]. */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		/* only ops 2..5 write the destination GPR (zw from the first
		 * group, xy from the second) */
		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* alternate between the two barycentric channels */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1; /* close each instruction group of 4 */
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
470
471 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
472 {
473 int i, r;
474 struct r600_bytecode_alu alu;
475
476 for (i = 0; i < 4; i++) {
477 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
478
479 alu.op = ALU_OP1_INTERP_LOAD_P0;
480
481 alu.dst.sel = ctx->shader->input[input].gpr;
482 alu.dst.write = 1;
483
484 alu.dst.chan = i;
485
486 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
487 alu.src[0].chan = i;
488
489 if (i == 3)
490 alu.last = 1;
491 r = r600_bytecode_add_alu(ctx->bc, &alu);
492 if (r)
493 return r;
494 }
495 return 0;
496 }
497
498 /*
499 * Special export handling in shaders
500 *
501 * shader export ARRAY_BASE for EXPORT_POS:
502 * 60 is position
503 * 61 is misc vector
504 * 62, 63 are clip distance vectors
505 *
506 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
507 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
508 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
509 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
510 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
511 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
512 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
513 * exclusive from render target index)
514 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
515 *
516 *
517 * shader export ARRAY_BASE for EXPORT_PIXEL:
518 * 0-7 CB targets
519 * 61 computed Z vector
520 *
521 * The use of the values exported in the computed Z vector are controlled
522 * by DB_SHADER_CONTROL:
523 * Z_EXPORT_ENABLE - Z as a float in RED
524 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
525 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
526 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
527 * DB_SOURCE_FORMAT - export control restrictions
528 *
529 */
530
531
532 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
533 static int r600_spi_sid(struct r600_shader_io * io)
534 {
535 int index, name = io->name;
536
537 /* These params are handled differently, they don't need
538 * semantic indices, so we'll use 0 for them.
539 */
540 if (name == TGSI_SEMANTIC_POSITION ||
541 name == TGSI_SEMANTIC_PSIZE ||
542 name == TGSI_SEMANTIC_EDGEFLAG ||
543 name == TGSI_SEMANTIC_FACE ||
544 name == TGSI_SEMANTIC_SAMPLEMASK)
545 index = 0;
546 else {
547 if (name == TGSI_SEMANTIC_GENERIC) {
548 /* For generic params simply use sid from tgsi */
549 index = io->sid;
550 } else {
551 /* For non-generic params - pack name and sid into 8 bits */
552 index = 0x80 | (name<<3) | (io->sid);
553 }
554
555 /* Make sure that all really used indices have nonzero value, so
556 * we can just compare it to 0 later instead of comparing the name
557 * with different values to detect special cases. */
558 index++;
559 }
560
561 return index;
562 };
563
/* turn input into interpolate on EG: allocate the input's LDS parameter
 * slot and emit either the interpolation ALUs (interpolated inputs) or a
 * flat parameter load. Inputs with spi_sid == 0 need no interpolation. */
static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
{
	int r = 0;

	if (ctx->shader->input[index].spi_sid) {
		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
		if (ctx->shader->input[index].interpolate > 0) {
			evergreen_interp_assign_ij_index(ctx, index);
			if (!ctx->use_llvm)
				r = evergreen_interp_alu(ctx, index);
		} else {
			/* non-interpolated input: load parameter directly */
			if (!ctx->use_llvm)
				r = evergreen_interp_flat(ctx, index);
		}
	}
	return r;
}
582
583 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
584 {
585 struct r600_bytecode_alu alu;
586 int i, r;
587 int gpr_front = ctx->shader->input[front].gpr;
588 int gpr_back = ctx->shader->input[back].gpr;
589
590 for (i = 0; i < 4; i++) {
591 memset(&alu, 0, sizeof(alu));
592 alu.op = ALU_OP3_CNDGT;
593 alu.is_op3 = 1;
594 alu.dst.write = 1;
595 alu.dst.sel = gpr_front;
596 alu.src[0].sel = ctx->face_gpr;
597 alu.src[1].sel = gpr_front;
598 alu.src[2].sel = gpr_back;
599
600 alu.dst.chan = i;
601 alu.src[1].chan = i;
602 alu.src[2].chan = i;
603 alu.last = (i==3);
604
605 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
606 return r;
607 }
608
609 return 0;
610 }
611
612 static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
613 {
614 return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
615 }
616
617 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
618 {
619 int i;
620 i = ctx->shader->noutput++;
621 ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
622 ctx->shader->output[i].sid = 0;
623 ctx->shader->output[i].gpr = 0;
624 ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
625 ctx->shader->output[i].write_mask = 0x4;
626 ctx->shader->output[i].spi_sid = prim_id_sid;
627
628 return 0;
629 }
630
/* Process one TGSI declaration token: record inputs/outputs with their
 * semantic and SPI ids, register GPR arrays for indirectly addressed
 * temporaries, and handle system-value declarations.
 * Returns 0 on success, -EINVAL for unsupported declarations. */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < Elements(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				if (ctx->bc->chip_class >= EVERGREEN) {
					/* eg+ fetches FS inputs with explicit interp ops */
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < Elements(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == TGSI_PROCESSOR_VERTEX ||
			    ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					/* 4 mask bits per clip-distance vector */
					ctx->shader->clip_dist_write |= d->Declaration.UsageMask <<
									((d->Semantic.Index + j) << 2);
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				/* indirectly addressed temp range becomes a GPR array */
				r600_add_gpr_array(ctx->shader,
						   ctx->file_offset[TGSI_FILE_TEMPORARY] +
						   d->Range.First,
						   d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
		    d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
		    d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			if (!ctx->native_integers) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				/* convert the integer instance id in GPR0.w
				 * to float in place */
				alu.op = ALU_OP1_INT_TO_FLT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		/* fallthrough - unhandled system values are an error */
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
783
/* Allocate the next driver-temporary GPR for the current instruction;
 * the per-instruction high-water mark is tracked in max_driver_temp_used. */
static int r600_get_temp(struct r600_shader_ctx *ctx)
{
	return ctx->temp_reg + ctx->max_driver_temp_used++;
}
788
/* Scan the TGSI tokens for system values (sample mask / sample id /
 * sample pos) and for interpolateAt* instructions that implicitly need
 * them, then reserve input GPRs for the ones in use starting at
 * gpr_offset. Also flags the interpolators interpolateAt* requires.
 * Returns the first GPR after the allocated ones. */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;	/* ctx field receiving the allocated GPR */
		unsigned name, alternate_name;
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int i, k, num_regs = 0;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					inputs[1].enabled = true; /* needs SAMPLEID */
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < Elements(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* allocate one GPR per enabled system-value input */
	for (i = 0; i < Elements(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput ++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
868
869 /*
870 * for evergreen we need to scan the shader to find the number of GPRs we need to
871 * reserve for interpolation and system values
872 *
873 * we need to know if we are going to emit
874 * any sample or centroid inputs
875 * if perspective and linear are required
876 */
/* Pre-pass for evergreen: determine which of the six interpolators the
 * fragment shader needs (from its input declarations and any
 * interpolateAt* instructions), assign each an ij pair index, and
 * compute how many GPRs must be reserved for barycentrics plus system
 * values. Returns the number of reserved GPRs. */
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	int i;
	int num_baryc;
	struct tgsi_parse_context parse;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
		    ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		}
	}

	tgsi_parse_free(&parse);

	/* assign gpr to each interpolator according to priority */
	num_baryc = 0;
	for (i = 0; i < Elements(ctx->eg_interpolators); i++) {
		if (ctx->eg_interpolators[i].enabled) {
			ctx->eg_interpolators[i].ij_index = num_baryc;
			num_baryc ++;
		}
	}

	/* XXX PULL MODEL and LINE STIPPLE */

	/* two ij pairs fit in one GPR */
	num_baryc = (num_baryc + 1) >> 1;
	return allocate_system_value_inputs(ctx, num_baryc);
}
948
/* Fetch a sample position (xyzw float) from the driver's buffer-info
 * constant buffer into a fresh temp GPR.
 * sample_id == NULL means fetch for the current sample: the index is then
 * read from the fixed point position GPR's .w channel; otherwise chan_sel
 * selects the channel of sample_id holding the index.
 * Returns the temp GPR number, or a negative error code. */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	assert(ctx->fixed_pt_position_gpr != -1);

	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		struct r600_bytecode_alu alu;

		/* copy the selected channel of sample_id into the temp so
		 * the vertex fetch can index with it */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 1; // first element is size of buffer
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}
1003
/* Translate a TGSI source operand into an r600_shader_src descriptor:
 * swizzles, negate/absolute modifiers, register select and, for constants,
 * the kcache bank.  Immediates and system values get special handling. */
static void tgsi_src(struct r600_shader_ctx *ctx,
		     const struct tgsi_full_src_register *tgsi_src,
		     struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		/* If all four channels read the same immediate dword, try to
		 * replace it with an inline hardware constant (0.0, 1.0, ...)
		 * via r600_bytecode_special_constants. */
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		/* otherwise keep it as a literal and copy all four dwords */
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			/* sample mask is read from the Z channel of face_gpr */
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			/* sample id lives in .w of the fixed point position GPR */
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			/* fetch the position of the current sample; swizzle 4
			 * selects constant 0 for the unused Z/W channels */
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			/* instance id is in R0.w for vertex shaders */
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			/* vertex id is in R0.x */
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* GS invocation id is read from R1.w */
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		}
	} else {
		/* plain register file: apply the per-file GPR base offset */
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		/* 2D constants select the kcache bank via the dimension index */
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}
1083
1084 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
1085 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
1086 unsigned int dst_reg)
1087 {
1088 struct r600_bytecode_vtx vtx;
1089 unsigned int ar_reg;
1090 int r;
1091
1092 if (offset) {
1093 struct r600_bytecode_alu alu;
1094
1095 memset(&alu, 0, sizeof(alu));
1096
1097 alu.op = ALU_OP2_ADD_INT;
1098 alu.src[0].sel = ctx->bc->ar_reg;
1099 alu.src[0].chan = ar_chan;
1100
1101 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1102 alu.src[1].value = offset;
1103
1104 alu.dst.sel = dst_reg;
1105 alu.dst.chan = ar_chan;
1106 alu.dst.write = 1;
1107 alu.last = 1;
1108
1109 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1110 return r;
1111
1112 ar_reg = dst_reg;
1113 } else {
1114 ar_reg = ctx->bc->ar_reg;
1115 }
1116
1117 memset(&vtx, 0, sizeof(vtx));
1118 vtx.buffer_id = cb_idx;
1119 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1120 vtx.src_gpr = ar_reg;
1121 vtx.src_sel_x = ar_chan;
1122 vtx.mega_fetch_count = 16;
1123 vtx.dst_gpr = dst_reg;
1124 vtx.dst_sel_x = 0; /* SEL_X */
1125 vtx.dst_sel_y = 1; /* SEL_Y */
1126 vtx.dst_sel_z = 2; /* SEL_Z */
1127 vtx.dst_sel_w = 3; /* SEL_W */
1128 vtx.data_format = FMT_32_32_32_32_FLOAT;
1129 vtx.num_format_all = 2; /* NUM_FORMAT_SCALED */
1130 vtx.format_comp_all = 1; /* FORMAT_COMP_SIGNED */
1131 vtx.endian = r600_endian_swap(32);
1132 vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;
1133
1134 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1135 return r;
1136
1137 return 0;
1138 }
1139
1140 static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1141 {
1142 struct r600_bytecode_vtx vtx;
1143 int r;
1144 unsigned index = src->Register.Index;
1145 unsigned vtx_id = src->Dimension.Index;
1146 int offset_reg = vtx_id / 3;
1147 int offset_chan = vtx_id % 3;
1148
1149 /* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
1150 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
1151
1152 if (offset_reg == 0 && offset_chan == 2)
1153 offset_chan = 3;
1154
1155 if (src->Dimension.Indirect) {
1156 int treg[3];
1157 int t2;
1158 struct r600_bytecode_alu alu;
1159 int r, i;
1160
1161 /* you have got to be shitting me -
1162 we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
1163 at least this is what fglrx seems to do. */
1164 for (i = 0; i < 3; i++) {
1165 treg[i] = r600_get_temp(ctx);
1166 }
1167 r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);
1168
1169 t2 = r600_get_temp(ctx);
1170 for (i = 0; i < 3; i++) {
1171 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1172 alu.op = ALU_OP1_MOV;
1173 alu.src[0].sel = 0;
1174 alu.src[0].chan = i == 2 ? 3 : i;
1175 alu.dst.sel = treg[i];
1176 alu.dst.chan = 0;
1177 alu.dst.write = 1;
1178 alu.last = 1;
1179 r = r600_bytecode_add_alu(ctx->bc, &alu);
1180 if (r)
1181 return r;
1182 }
1183 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1184 alu.op = ALU_OP1_MOV;
1185 alu.src[0].sel = treg[0];
1186 alu.src[0].rel = 1;
1187 alu.dst.sel = t2;
1188 alu.dst.write = 1;
1189 alu.last = 1;
1190 r = r600_bytecode_add_alu(ctx->bc, &alu);
1191 if (r)
1192 return r;
1193 offset_reg = t2;
1194 }
1195
1196
1197 memset(&vtx, 0, sizeof(vtx));
1198 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1199 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1200 vtx.src_gpr = offset_reg;
1201 vtx.src_sel_x = offset_chan;
1202 vtx.offset = index * 16; /*bytes*/
1203 vtx.mega_fetch_count = 16;
1204 vtx.dst_gpr = dst_reg;
1205 vtx.dst_sel_x = 0; /* SEL_X */
1206 vtx.dst_sel_y = 1; /* SEL_Y */
1207 vtx.dst_sel_z = 2; /* SEL_Z */
1208 vtx.dst_sel_w = 3; /* SEL_W */
1209 if (ctx->bc->chip_class >= EVERGREEN) {
1210 vtx.use_const_fields = 1;
1211 } else {
1212 vtx.data_format = FMT_32_32_32_32_FLOAT;
1213 }
1214
1215 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1216 return r;
1217
1218 return 0;
1219 }
1220
1221 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1222 {
1223 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1224 int i;
1225
1226 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1227 struct tgsi_full_src_register *src = &inst->Src[i];
1228
1229 if (src->Register.File == TGSI_FILE_INPUT) {
1230 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1231 /* primitive id is in R0.z */
1232 ctx->src[i].sel = 0;
1233 ctx->src[i].swizzle[0] = 2;
1234 }
1235 }
1236 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1237 int treg = r600_get_temp(ctx);
1238
1239 fetch_gs_input(ctx, src, treg);
1240 ctx->src[i].sel = treg;
1241 }
1242 }
1243 return 0;
1244 }
1245
/* Translate all source operands of the current instruction (via tgsi_src)
 * and copy constant-file operands into temporaries so that at most one
 * constant is addressed directly by the ALU instruction; j counts the
 * constants still to be split, and when it reaches 0 the final constant is
 * left in place.  Relatively-addressed constants are always fetched into a
 * temp with a VTX clause.  Returns 0 on success or a negative error code. */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* first pass: translate every operand and count the constants */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	/* second pass: split all but the last constant into temps */
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			/* relative addressing: fetch the constant through a
			 * VTX clause (sel - 512 undoes the constant-file GPR
			 * base offset) */
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* direct constant: MOV all four channels to a temp */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
1300
1301 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1302 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1303 {
1304 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1305 struct r600_bytecode_alu alu;
1306 int i, j, k, nliteral, r;
1307
1308 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1309 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1310 nliteral++;
1311 }
1312 }
1313 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1314 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1315 int treg = r600_get_temp(ctx);
1316 for (k = 0; k < 4; k++) {
1317 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1318 alu.op = ALU_OP1_MOV;
1319 alu.src[0].sel = ctx->src[i].sel;
1320 alu.src[0].chan = k;
1321 alu.src[0].value = ctx->src[i].value[k];
1322 alu.dst.sel = treg;
1323 alu.dst.chan = k;
1324 alu.dst.write = 1;
1325 if (k == 3)
1326 alu.last = 1;
1327 r = r600_bytecode_add_alu(ctx->bc, &alu);
1328 if (r)
1329 return r;
1330 }
1331 ctx->src[i].sel = treg;
1332 j--;
1333 }
1334 }
1335 return 0;
1336 }
1337
1338 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1339 {
1340 int i, r, count = ctx->shader->ninput;
1341
1342 for (i = 0; i < count; i++) {
1343 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1344 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
1345 if (r)
1346 return r;
1347 }
1348 }
1349 return 0;
1350 }
1351
/* Emit MEM_STREAM exports that write the shader outputs listed in 'so' to
 * the stream-output buffers.  stream == -1 emits all outputs; otherwise
 * only outputs bound to the given stream/buffer are written (note the
 * pre-Evergreen path compares against output_buffer here).  Records which
 * stream buffers are used in ctx->enabled_stream_buffers_mask.
 * Returns 0 on success or a negative error code. */
static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
						  int stream, unsigned *stream_item_size)
{
	unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
	unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
	int i, j, r;

	/* Sanity checking. */
	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
		r = -EINVAL;
		goto out_err;
	}
	for (i = 0; i < so->num_outputs; i++) {
		if (so->output[i].output_buffer >= 4) {
			R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
				 so->output[i].output_buffer);
			r = -EINVAL;
			goto out_err;
		}
	}

	/* Initialize locations where the outputs are stored. */
	for (i = 0; i < so->num_outputs; i++) {

		so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
		start_comp[i] = so->output[i].start_component;
		/* Lower outputs with dst_offset < start_component.
		 *
		 * We can only output 4D vectors with a write mask, e.g. we can
		 * only output the W component at offset 3, etc. If we want
		 * to store Y, Z, or W at buffer offset 0, we need to use MOV
		 * to move it to X and output X. */
		if (so->output[i].dst_offset < so->output[i].start_component) {
			unsigned tmp = r600_get_temp(ctx);

			/* shift the used components down to start at .x of tmp */
			for (j = 0; j < so->output[i].num_components; j++) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = so_gpr[i];
				alu.src[0].chan = so->output[i].start_component + j;

				alu.dst.sel = tmp;
				alu.dst.chan = j;
				alu.dst.write = 1;
				if (j == so->output[i].num_components - 1)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			start_comp[i] = 0;
			so_gpr[i] = tmp;
		}
	}

	/* Write outputs to buffers. */
	for (i = 0; i < so->num_outputs; i++) {
		struct r600_bytecode_output output;

		/* when a specific stream is requested, skip non-matching outputs
		 * (NOTE: this compares against output_buffer, not .stream) */
		if (stream != -1 && stream != so->output[i].output_buffer)
			continue;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = so_gpr[i];
		output.elem_size = so->output[i].num_components - 1;
		if (output.elem_size == 2)
			output.elem_size = 3; // 3 not supported, write 4 with junk at end
		output.array_base = so->output[i].dst_offset - start_comp[i];
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
		output.burst_count = 1;
		/* array_size is an upper limit for the burst_count
		 * with MEM_STREAM instructions */
		output.array_size = 0xFFF;
		output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];

		if (ctx->bc->chip_class >= EVERGREEN) {
			/* EG+: per-stream, per-buffer CF ops laid out as
			 * STREAM{n}_BUF{m} = STREAM0_BUF{m} + n*4 */
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0_BUF0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM0_BUF1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM0_BUF2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM0_BUF3;
				break;
			}
			output.op += so->output[i].stream * 4;
			assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
			ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
		} else {
			/* R600/R700: one CF op per buffer, stream 0 only */
			switch (so->output[i].output_buffer) {
			case 0:
				output.op = CF_OP_MEM_STREAM0;
				break;
			case 1:
				output.op = CF_OP_MEM_STREAM1;
				break;
			case 2:
				output.op = CF_OP_MEM_STREAM2;
				break;
			case 3:
				output.op = CF_OP_MEM_STREAM3;
				break;
			}
			ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
		}
		r = r600_bytecode_add_output(ctx->bc, &output);
		if (r)
			goto out_err;
	}
	return 0;
out_err:
	return r;
}
1472
1473 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
1474 {
1475 struct r600_bytecode_alu alu;
1476 unsigned reg;
1477
1478 if (!ctx->shader->vs_out_edgeflag)
1479 return;
1480
1481 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
1482
1483 /* clamp(x, 0, 1) */
1484 memset(&alu, 0, sizeof(alu));
1485 alu.op = ALU_OP1_MOV;
1486 alu.src[0].sel = reg;
1487 alu.dst.sel = reg;
1488 alu.dst.write = 1;
1489 alu.dst.clamp = 1;
1490 alu.last = 1;
1491 r600_bytecode_add_alu(ctx->bc, &alu);
1492
1493 memset(&alu, 0, sizeof(alu));
1494 alu.op = ALU_OP1_FLT_TO_INT;
1495 alu.src[0].sel = reg;
1496 alu.dst.sel = reg;
1497 alu.dst.write = 1;
1498 alu.last = 1;
1499 r600_bytecode_add_alu(ctx->bc, &alu);
1500 }
1501
/* Build the "GS copy shader": a small VS that reads the vertices the GS
 * wrote to the GSVS ring and performs the normal vertex exports
 * (position, parameters, clip distances, streamout).  The generated
 * shader branches on the stream id packed into R0.x (low 30 bits are the
 * ring offset, bits 30-31 the stream) and is stored in gs->gs_copy_shader.
 * Returns the result of r600_bytecode_build, or 0 on allocation failure. */
static int generate_gs_copy_shader(struct r600_context *rctx,
				   struct r600_pipe_shader *gs,
				   struct pipe_stream_output_info *so)
{
	struct r600_shader_ctx ctx = {};
	struct r600_shader *gs_shader = &gs->shader;
	struct r600_pipe_shader *cshader;
	int ocnt = gs_shader->noutput;
	struct r600_bytecode_alu alu;
	struct r600_bytecode_vtx vtx;
	struct r600_bytecode_output output;
	struct r600_bytecode_cf *cf_jump, *cf_pop,
		*last_exp_pos = NULL, *last_exp_param = NULL;
	int i, j, next_clip_pos = 61, next_param = 0;
	int ring;

	cshader = calloc(1, sizeof(struct r600_pipe_shader));
	if (!cshader)
		return 0;

	/* the copy shader exports exactly the GS outputs */
	memcpy(cshader->shader.output, gs_shader->output, ocnt *
	       sizeof(struct r600_shader_io));

	cshader->shader.noutput = ocnt;

	ctx.shader = &cshader->shader;
	ctx.bc = &ctx.shader->bc;
	/* from the hardware's point of view this runs as a vertex shader */
	ctx.type = ctx.bc->type = TGSI_PROCESSOR_VERTEX;

	r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
			   rctx->screen->has_compressed_msaa_texturing);

	ctx.bc->isa = rctx->isa;

	cf_jump = NULL;
	memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));

	/* R0.x = R0.x & 0x3fffffff  (mask off the stream id bits) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_AND_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x3fffffff;
	alu.dst.write = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* R0.y = R0.x >> 30  (extract the stream id) */
	memset(&alu, 0, sizeof(alu));
	alu.op = ALU_OP2_LSHR_INT;
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].value = 0x1e;
	alu.dst.chan = 1;
	alu.dst.write = 1;
	alu.last = 1;
	r600_bytecode_add_alu(ctx.bc, &alu);

	/* fetch vertex data from GSVS ring */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];

		out->gpr = i + 1;
		out->ring_offset = i * 16;

		memset(&vtx, 0, sizeof(vtx));
		vtx.op = FETCH_OP_VFETCH;
		vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
		vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
		vtx.mega_fetch_count = 16;
		vtx.offset = out->ring_offset;
		vtx.dst_gpr = out->gpr;
		vtx.src_gpr = 0;
		vtx.dst_sel_x = 0;
		vtx.dst_sel_y = 1;
		vtx.dst_sel_z = 2;
		vtx.dst_sel_w = 3;
		if (rctx->b.chip_class >= EVERGREEN) {
			vtx.use_const_fields = 1;
		} else {
			vtx.data_format = FMT_32_32_32_32_FLOAT;
		}

		r600_bytecode_add_vtx(ctx.bc, &vtx);
	}
	ctx.temp_reg = i + 1;
	/* per-stream streamout: one predicated block per stream, highest
	 * stream first; stream 0 is always emitted (it also does the
	 * vertex exports below) */
	for (ring = 3; ring >= 0; --ring) {
		bool enabled = false;
		for (i = 0; i < so->num_outputs; i++) {
			if (so->output[i].stream == ring) {
				enabled = true;
				break;
			}
		}
		if (ring != 0 && !enabled) {
			cshader->shader.ring_item_sizes[ring] = 0;
			continue;
		}

		if (cf_jump) {
			// Patch up jump label
			r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
			cf_pop = ctx.bc->cf_last;

			cf_jump->cf_addr = cf_pop->id + 2;
			cf_jump->pop_count = 1;
			cf_pop->cf_addr = cf_pop->id + 2;
			cf_pop->pop_count = 1;
		}

		/* PRED_SETE_INT __, R0.y, ring  (execute block iff stream == ring) */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP2_PRED_SETE_INT;
		alu.src[0].chan = 1;
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ring;
		alu.execute_mask = 1;
		alu.update_pred = 1;
		alu.last = 1;
		r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
		cf_jump = ctx.bc->cf_last;

		if (enabled)
			emit_streamout(&ctx, so, ring, &cshader->shader.ring_item_sizes[ring]);
		cshader->shader.ring_item_sizes[ring] = ocnt * 16;
	}

	/* bc adds nops - copy it */
	if (ctx.bc->chip_class == R600) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP0_NOP;
		alu.last = 1;
		r600_bytecode_add_alu(ctx.bc, &alu);

		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
	}

	/* export vertex data */
	/* XXX factor out common code with r600_shader_from_tgsi ? */
	for (i = 0; i < ocnt; ++i) {
		struct r600_shader_io *out = &ctx.shader->output[i];
		bool instream0 = true;
		if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
			continue;

		/* only outputs belonging to stream 0 are exported to the rasterizer */
		for (j = 0; j < so->num_outputs; j++) {
			if (so->output[j].register_index == i) {
				if (so->output[j].stream == 0)
					break;
				if (so->output[j].stream > 0)
					instream0 = false;
			}
		}
		if (!instream0)
			continue;
		memset(&output, 0, sizeof(output));
		output.gpr = out->gpr;
		output.elem_size = 3;
		output.swizzle_x = 0;
		output.swizzle_y = 1;
		output.swizzle_z = 2;
		output.swizzle_w = 3;
		output.burst_count = 1;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		output.op = CF_OP_EXPORT;
		/* position-like semantics go to POS exports 60..63; everything
		 * else becomes a sequentially numbered PARAM export */
		switch (out->name) {
		case TGSI_SEMANTIC_POSITION:
			output.array_base = 60;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;

		case TGSI_SEMANTIC_PSIZE:
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_point_size = 1;
			break;
		case TGSI_SEMANTIC_LAYER:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 0;
			output.swizzle_w = 7;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_layer = 1;
			break;
		case TGSI_SEMANTIC_VIEWPORT_INDEX:
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = 61;
			if (next_clip_pos == 61)
				next_clip_pos = 62;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			ctx.shader->vs_out_misc_write = 1;
			ctx.shader->vs_out_viewport = 1;
			output.swizzle_x = 7;
			output.swizzle_y = 7;
			output.swizzle_z = 7;
			output.swizzle_w = 0;
			break;
		case TGSI_SEMANTIC_CLIPDIST:
			/* spi_sid is 0 for clipdistance outputs that were generated
			 * for clipvertex - we don't need to pass them to PS */
			ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
			if (out->spi_sid) {
				/* duplicate it as PARAM to pass to the pixel shader */
				output.array_base = next_param++;
				r600_bytecode_add_output(ctx.bc, &output);
				last_exp_param = ctx.bc->cf_last;
			}
			output.array_base = next_clip_pos++;
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
			break;
		case TGSI_SEMANTIC_FOG:
			output.swizzle_y = 4; /* 0 */
			output.swizzle_z = 4; /* 0 */
			output.swizzle_w = 5; /* 1 */
			break;
		default:
			output.array_base = next_param++;
			break;
		}
		r600_bytecode_add_output(ctx.bc, &output);
		if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
			last_exp_param = ctx.bc->cf_last;
		else
			last_exp_pos = ctx.bc->cf_last;
	}

	/* the hardware requires at least one POS export; emit a dummy one
	 * (all channels masked) if nothing was exported above */
	if (!last_exp_pos) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = 60;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_pos = ctx.bc->cf_last;
	}

	/* likewise at least one PARAM export */
	if (!last_exp_param) {
		memset(&output, 0, sizeof(output));
		output.gpr = 0;
		output.elem_size = 3;
		output.swizzle_x = 7;
		output.swizzle_y = 7;
		output.swizzle_z = 7;
		output.swizzle_w = 7;
		output.burst_count = 1;
		output.type = 2;
		output.op = CF_OP_EXPORT;
		output.array_base = next_param++;
		output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
		r600_bytecode_add_output(ctx.bc, &output);
		last_exp_param = ctx.bc->cf_last;
	}

	/* mark the final POS and PARAM exports as DONE */
	last_exp_pos->op = CF_OP_EXPORT_DONE;
	last_exp_param->op = CF_OP_EXPORT_DONE;

	/* close the last predicated block */
	r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
	cf_pop = ctx.bc->cf_last;

	cf_jump->cf_addr = cf_pop->id + 2;
	cf_jump->pop_count = 1;
	cf_pop->cf_addr = cf_pop->id + 2;
	cf_pop->pop_count = 1;

	/* terminate the program (Cayman uses an explicit CF_END) */
	if (ctx.bc->chip_class == CAYMAN)
		cm_bytecode_add_cf_end(ctx.bc);
	else {
		r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
		ctx.bc->cf_last->end_of_program = 1;
	}

	gs->gs_copy_shader = cshader;
	cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;

	/* one level of predication stack is used for the stream branches */
	ctx.bc->nstack = 1;

	return r600_bytecode_build(ctx.bc);
}
1807
1808 static int emit_inc_ring_offset(struct r600_shader_ctx *ctx, int idx, bool ind)
1809 {
1810 if (ind) {
1811 struct r600_bytecode_alu alu;
1812 int r;
1813
1814 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1815 alu.op = ALU_OP2_ADD_INT;
1816 alu.src[0].sel = ctx->gs_export_gpr_tregs[idx];
1817 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1818 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
1819 alu.dst.sel = ctx->gs_export_gpr_tregs[idx];
1820 alu.dst.write = 1;
1821 alu.last = 1;
1822 r = r600_bytecode_add_alu(ctx->bc, &alu);
1823 if (r)
1824 return r;
1825 }
1826 return 0;
1827 }
1828
/* Write the current shader outputs to the GS ring buffer for the given
 * stream (or stream 0 when stream == -1).  For an ES feeding a GS, each
 * output's ring offset is looked up from the matching GS input (by
 * name/sid); otherwise outputs are packed sequentially.  'ind' selects
 * indirect ring writes indexed by the per-stream export offset register.
 * Increments gs_next_vertex.  Always returns 0. */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int i, k, ring_offset;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			/* outputs the GS never reads are simply not written */
			if (ring_offset == -1)
				continue;
		} else {
			ring_offset = idx * 16;
			idx++;
		}

		/* position is only written for stream 0 */
		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		/* pick the MEM_RING op matching the target stream */
		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	++ctx->gs_next_vertex;
	return 0;
}
1898
1899 static int r600_shader_from_tgsi(struct r600_context *rctx,
1900 struct r600_pipe_shader *pipeshader,
1901 union r600_shader_key key)
1902 {
1903 struct r600_screen *rscreen = rctx->screen;
1904 struct r600_shader *shader = &pipeshader->shader;
1905 struct tgsi_token *tokens = pipeshader->selector->tokens;
1906 struct pipe_stream_output_info so = pipeshader->selector->so;
1907 struct tgsi_full_immediate *immediate;
1908 struct r600_shader_ctx ctx;
1909 struct r600_bytecode_output output[32];
1910 unsigned output_done, noutput;
1911 unsigned opcode;
1912 int i, j, k, r = 0;
1913 int next_param_base = 0, next_clip_base;
1914 int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
1915 /* Declarations used by llvm code */
1916 bool use_llvm = false;
1917 bool indirect_gprs;
1918 bool ring_outputs = false;
1919 bool pos_emitted = false;
1920
1921 #ifdef R600_USE_LLVM
1922 use_llvm = rscreen->b.debug_flags & DBG_LLVM;
1923 #endif
1924 ctx.bc = &shader->bc;
1925 ctx.shader = shader;
1926 ctx.native_integers = true;
1927
1928
1929 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
1930 rscreen->has_compressed_msaa_texturing);
1931 ctx.tokens = tokens;
1932 tgsi_scan_shader(tokens, &ctx.info);
1933 shader->indirect_files = ctx.info.indirect_files;
1934
1935 shader->uses_doubles = ctx.info.uses_doubles;
1936
1937 indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
1938 tgsi_parse_init(&ctx.parse, tokens);
1939 ctx.type = ctx.info.processor;
1940 shader->processor_type = ctx.type;
1941 ctx.bc->type = shader->processor_type;
1942
1943 switch (ctx.type) {
1944 case TGSI_PROCESSOR_VERTEX:
1945 shader->vs_as_gs_a = key.vs.as_gs_a;
1946 shader->vs_as_es = key.vs.as_es;
1947 shader->vs_as_ls = key.vs.as_ls;
1948 if (shader->vs_as_es)
1949 ring_outputs = true;
1950 break;
1951 case TGSI_PROCESSOR_GEOMETRY:
1952 ring_outputs = true;
1953 break;
1954 case TGSI_PROCESSOR_TESS_CTRL:
1955 shader->tcs_prim_mode = key.tcs.prim_mode;
1956 break;
1957 case TGSI_PROCESSOR_TESS_EVAL:
1958 shader->tes_as_es = key.tes.as_es;
1959 if (shader->tes_as_es)
1960 ring_outputs = true;
1961 break;
1962 case TGSI_PROCESSOR_FRAGMENT:
1963 shader->two_side = key.ps.color_two_side;
1964 break;
1965 default:
1966 break;
1967 }
1968
1969 if (shader->vs_as_es || shader->tes_as_es) {
1970 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
1971 } else {
1972 ctx.gs_for_vs = NULL;
1973 }
1974
1975 ctx.next_ring_offset = 0;
1976 ctx.gs_out_ring_offset = 0;
1977 ctx.gs_next_vertex = 0;
1978 ctx.gs_stream_output_info = &so;
1979
1980 ctx.face_gpr = -1;
1981 ctx.fixed_pt_position_gpr = -1;
1982 ctx.fragcoord_input = -1;
1983 ctx.colors_used = 0;
1984 ctx.clip_vertex_write = 0;
1985
1986 shader->nr_ps_color_exports = 0;
1987 shader->nr_ps_max_color_exports = 0;
1988
1989
1990 /* register allocations */
1991 /* Values [0,127] correspond to GPR[0..127].
1992 * Values [128,159] correspond to constant buffer bank 0
1993 * Values [160,191] correspond to constant buffer bank 1
1994 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1995 * Values [256,287] correspond to constant buffer bank 2 (EG)
1996 * Values [288,319] correspond to constant buffer bank 3 (EG)
1997 * Other special values are shown in the list below.
1998 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1999 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
2000 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
2001 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
2002 * 248 SQ_ALU_SRC_0: special constant 0.0.
2003 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
2004 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
2005 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
2006 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
2007 * 253 SQ_ALU_SRC_LITERAL: literal constant.
2008 * 254 SQ_ALU_SRC_PV: previous vector result.
2009 * 255 SQ_ALU_SRC_PS: previous scalar result.
2010 */
2011 for (i = 0; i < TGSI_FILE_COUNT; i++) {
2012 ctx.file_offset[i] = 0;
2013 }
2014
2015 #ifdef R600_USE_LLVM
2016 if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
2017 fprintf(stderr, "Warning: R600 LLVM backend does not support "
2018 "indirect adressing. Falling back to TGSI "
2019 "backend.\n");
2020 use_llvm = 0;
2021 }
2022 #endif
2023 if (ctx.type == TGSI_PROCESSOR_VERTEX) {
2024 ctx.file_offset[TGSI_FILE_INPUT] = 1;
2025 if (!use_llvm) {
2026 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
2027 }
2028 }
2029 if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
2030 if (ctx.bc->chip_class >= EVERGREEN)
2031 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
2032 else
2033 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
2034 }
2035 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2036 /* FIXME 1 would be enough in some cases (3 or less input vertices) */
2037 ctx.file_offset[TGSI_FILE_INPUT] = 2;
2038 }
2039 ctx.use_llvm = use_llvm;
2040
2041 if (use_llvm) {
2042 ctx.file_offset[TGSI_FILE_OUTPUT] =
2043 ctx.file_offset[TGSI_FILE_INPUT];
2044 } else {
2045 ctx.file_offset[TGSI_FILE_OUTPUT] =
2046 ctx.file_offset[TGSI_FILE_INPUT] +
2047 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
2048 }
2049 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
2050 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
2051
2052 /* Outside the GPR range. This will be translated to one of the
2053 * kcache banks later. */
2054 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
2055
2056 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
2057 ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
2058 ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
2059 ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
2060 ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
2061
2062 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2063 ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
2064 ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
2065 ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
2066 ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
2067 ctx.temp_reg = ctx.bc->ar_reg + 7;
2068 } else {
2069 ctx.temp_reg = ctx.bc->ar_reg + 3;
2070 }
2071
2072 shader->max_arrays = 0;
2073 shader->num_arrays = 0;
2074 if (indirect_gprs) {
2075
2076 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
2077 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
2078 ctx.file_offset[TGSI_FILE_OUTPUT] -
2079 ctx.file_offset[TGSI_FILE_INPUT],
2080 0x0F);
2081 }
2082 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
2083 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
2084 ctx.file_offset[TGSI_FILE_TEMPORARY] -
2085 ctx.file_offset[TGSI_FILE_OUTPUT],
2086 0x0F);
2087 }
2088 }
2089
2090 ctx.nliterals = 0;
2091 ctx.literals = NULL;
2092
2093 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
2094 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
2095 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
2096
2097 if (shader->vs_as_gs_a)
2098 vs_add_primid_output(&ctx, key.vs.prim_id_out);
2099
2100 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
2101 tgsi_parse_token(&ctx.parse);
2102 switch (ctx.parse.FullToken.Token.Type) {
2103 case TGSI_TOKEN_TYPE_IMMEDIATE:
2104 immediate = &ctx.parse.FullToken.FullImmediate;
2105 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
2106 if(ctx.literals == NULL) {
2107 r = -ENOMEM;
2108 goto out_err;
2109 }
2110 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
2111 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
2112 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
2113 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
2114 ctx.nliterals++;
2115 break;
2116 case TGSI_TOKEN_TYPE_DECLARATION:
2117 r = tgsi_declaration(&ctx);
2118 if (r)
2119 goto out_err;
2120 break;
2121 case TGSI_TOKEN_TYPE_INSTRUCTION:
2122 case TGSI_TOKEN_TYPE_PROPERTY:
2123 break;
2124 default:
2125 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
2126 r = -EINVAL;
2127 goto out_err;
2128 }
2129 }
2130
2131 shader->ring_item_sizes[0] = ctx.next_ring_offset;
2132 shader->ring_item_sizes[1] = 0;
2133 shader->ring_item_sizes[2] = 0;
2134 shader->ring_item_sizes[3] = 0;
2135
2136 /* Process two side if needed */
2137 if (shader->two_side && ctx.colors_used) {
2138 int i, count = ctx.shader->ninput;
2139 unsigned next_lds_loc = ctx.shader->nlds;
2140
2141 /* additional inputs will be allocated right after the existing inputs,
2142 * we won't need them after the color selection, so we don't need to
2143 * reserve these gprs for the rest of the shader code and to adjust
2144 * output offsets etc. */
2145 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
2146 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
2147
2148 /* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
2149 if (ctx.face_gpr == -1) {
2150 i = ctx.shader->ninput++;
2151 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
2152 ctx.shader->input[i].spi_sid = 0;
2153 ctx.shader->input[i].gpr = gpr++;
2154 ctx.face_gpr = ctx.shader->input[i].gpr;
2155 }
2156
2157 for (i = 0; i < count; i++) {
2158 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2159 int ni = ctx.shader->ninput++;
2160 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
2161 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
2162 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
2163 ctx.shader->input[ni].gpr = gpr++;
2164 // TGSI to LLVM needs to know the lds position of inputs.
2165 // Non LLVM path computes it later (in process_twoside_color)
2166 ctx.shader->input[ni].lds_pos = next_lds_loc++;
2167 ctx.shader->input[i].back_color_input = ni;
2168 if (ctx.bc->chip_class >= EVERGREEN) {
2169 if ((r = evergreen_interp_input(&ctx, ni)))
2170 return r;
2171 }
2172 }
2173 }
2174 }
2175
2176 /* LLVM backend setup */
2177 #ifdef R600_USE_LLVM
2178 if (use_llvm) {
2179 struct radeon_llvm_context radeon_llvm_ctx;
2180 LLVMModuleRef mod;
2181 bool dump = r600_can_dump_shader(&rscreen->b, tokens);
2182 boolean use_kill = false;
2183
2184 memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
2185 radeon_llvm_ctx.type = ctx.type;
2186 radeon_llvm_ctx.two_side = shader->two_side;
2187 radeon_llvm_ctx.face_gpr = ctx.face_gpr;
2188 radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1;
2189 radeon_llvm_ctx.r600_inputs = ctx.shader->input;
2190 radeon_llvm_ctx.r600_outputs = ctx.shader->output;
2191 radeon_llvm_ctx.color_buffer_count = max_color_exports;
2192 radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
2193 radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
2194 radeon_llvm_ctx.stream_outputs = &so;
2195 radeon_llvm_ctx.alpha_to_one = key.ps.alpha_to_one;
2196 radeon_llvm_ctx.has_compressed_msaa_texturing =
2197 ctx.bc->has_compressed_msaa_texturing;
2198 mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
2199 ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
2200 ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
2201
2202 if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
2203 radeon_llvm_dispose(&radeon_llvm_ctx);
2204 use_llvm = 0;
2205 fprintf(stderr, "R600 LLVM backend failed to compile "
2206 "shader. Falling back to TGSI\n");
2207 } else {
2208 ctx.file_offset[TGSI_FILE_OUTPUT] =
2209 ctx.file_offset[TGSI_FILE_INPUT];
2210 }
2211 if (use_kill)
2212 ctx.shader->uses_kill = use_kill;
2213 radeon_llvm_dispose(&radeon_llvm_ctx);
2214 }
2215 #endif
2216 /* End of LLVM backend setup */
2217
2218 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
2219 shader->nr_ps_max_color_exports = 8;
2220
2221 if (!use_llvm) {
2222 if (ctx.fragcoord_input >= 0) {
2223 if (ctx.bc->chip_class == CAYMAN) {
2224 for (j = 0 ; j < 4; j++) {
2225 struct r600_bytecode_alu alu;
2226 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2227 alu.op = ALU_OP1_RECIP_IEEE;
2228 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
2229 alu.src[0].chan = 3;
2230
2231 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
2232 alu.dst.chan = j;
2233 alu.dst.write = (j == 3);
2234 alu.last = 1;
2235 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
2236 return r;
2237 }
2238 } else {
2239 struct r600_bytecode_alu alu;
2240 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2241 alu.op = ALU_OP1_RECIP_IEEE;
2242 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
2243 alu.src[0].chan = 3;
2244
2245 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
2246 alu.dst.chan = 3;
2247 alu.dst.write = 1;
2248 alu.last = 1;
2249 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
2250 return r;
2251 }
2252 }
2253
2254 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2255 struct r600_bytecode_alu alu;
2256 int r;
2257
2258 /* GS thread with no output workaround - emit a cut at start of GS */
2259 if (ctx.bc->chip_class == R600)
2260 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
2261
2262 for (j = 0; j < 4; j++) {
2263 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2264 alu.op = ALU_OP1_MOV;
2265 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
2266 alu.src[0].value = 0;
2267 alu.dst.sel = ctx.gs_export_gpr_tregs[j];
2268 alu.dst.write = 1;
2269 alu.last = 1;
2270 r = r600_bytecode_add_alu(ctx.bc, &alu);
2271 if (r)
2272 return r;
2273 }
2274 }
2275 if (shader->two_side && ctx.colors_used) {
2276 if ((r = process_twoside_color_inputs(&ctx)))
2277 return r;
2278 }
2279
2280 tgsi_parse_init(&ctx.parse, tokens);
2281 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
2282 tgsi_parse_token(&ctx.parse);
2283 switch (ctx.parse.FullToken.Token.Type) {
2284 case TGSI_TOKEN_TYPE_INSTRUCTION:
2285 r = tgsi_is_supported(&ctx);
2286 if (r)
2287 goto out_err;
2288 ctx.max_driver_temp_used = 0;
2289 /* reserve first tmp for everyone */
2290 r600_get_temp(&ctx);
2291
2292 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
2293 if ((r = tgsi_split_constant(&ctx)))
2294 goto out_err;
2295 if ((r = tgsi_split_literal_constant(&ctx)))
2296 goto out_err;
2297 if (ctx.type == TGSI_PROCESSOR_GEOMETRY)
2298 if ((r = tgsi_split_gs_inputs(&ctx)))
2299 goto out_err;
2300 if (ctx.bc->chip_class == CAYMAN)
2301 ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
2302 else if (ctx.bc->chip_class >= EVERGREEN)
2303 ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
2304 else
2305 ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
2306 r = ctx.inst_info->process(&ctx);
2307 if (r)
2308 goto out_err;
2309 break;
2310 default:
2311 break;
2312 }
2313 }
2314 }
2315
2316 /* Reset the temporary register counter. */
2317 ctx.max_driver_temp_used = 0;
2318
2319 noutput = shader->noutput;
2320
2321 if (!ring_outputs && ctx.clip_vertex_write) {
2322 unsigned clipdist_temp[2];
2323
2324 clipdist_temp[0] = r600_get_temp(&ctx);
2325 clipdist_temp[1] = r600_get_temp(&ctx);
2326
2327 /* need to convert a clipvertex write into clipdistance writes and not export
2328 the clip vertex anymore */
2329
2330 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
2331 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
2332 shader->output[noutput].gpr = clipdist_temp[0];
2333 noutput++;
2334 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
2335 shader->output[noutput].gpr = clipdist_temp[1];
2336 noutput++;
2337
2338 /* reset spi_sid for clipvertex output to avoid confusing spi */
2339 shader->output[ctx.cv_output].spi_sid = 0;
2340
2341 shader->clip_dist_write = 0xFF;
2342
2343 for (i = 0; i < 8; i++) {
2344 int oreg = i >> 2;
2345 int ochan = i & 3;
2346
2347 for (j = 0; j < 4; j++) {
2348 struct r600_bytecode_alu alu;
2349 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2350 alu.op = ALU_OP2_DOT4;
2351 alu.src[0].sel = shader->output[ctx.cv_output].gpr;
2352 alu.src[0].chan = j;
2353
2354 alu.src[1].sel = 512 + i;
2355 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
2356 alu.src[1].chan = j;
2357
2358 alu.dst.sel = clipdist_temp[oreg];
2359 alu.dst.chan = j;
2360 alu.dst.write = (j == ochan);
2361 if (j == 3)
2362 alu.last = 1;
2363 if (!use_llvm)
2364 r = r600_bytecode_add_alu(ctx.bc, &alu);
2365 if (r)
2366 return r;
2367 }
2368 }
2369 }
2370
2371 /* Add stream outputs. */
2372 if (!ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX &&
2373 so.num_outputs && !use_llvm)
2374 emit_streamout(&ctx, &so, -1, NULL);
2375
2376 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
2377 convert_edgeflag_to_int(&ctx);
2378
2379 if (ring_outputs) {
2380 if (shader->vs_as_es || shader->tes_as_es) {
2381 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
2382 ctx.gs_export_gpr_tregs[1] = -1;
2383 ctx.gs_export_gpr_tregs[2] = -1;
2384 ctx.gs_export_gpr_tregs[3] = -1;
2385
2386 emit_gs_ring_writes(&ctx, &so, -1, FALSE);
2387 }
2388 } else {
2389 /* Export output */
2390 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
2391
2392 for (i = 0, j = 0; i < noutput; i++, j++) {
2393 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2394 output[j].gpr = shader->output[i].gpr;
2395 output[j].elem_size = 3;
2396 output[j].swizzle_x = 0;
2397 output[j].swizzle_y = 1;
2398 output[j].swizzle_z = 2;
2399 output[j].swizzle_w = 3;
2400 output[j].burst_count = 1;
2401 output[j].type = -1;
2402 output[j].op = CF_OP_EXPORT;
2403 switch (ctx.type) {
2404 case TGSI_PROCESSOR_VERTEX:
2405 switch (shader->output[i].name) {
2406 case TGSI_SEMANTIC_POSITION:
2407 output[j].array_base = 60;
2408 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2409 pos_emitted = true;
2410 break;
2411
2412 case TGSI_SEMANTIC_PSIZE:
2413 output[j].array_base = 61;
2414 output[j].swizzle_y = 7;
2415 output[j].swizzle_z = 7;
2416 output[j].swizzle_w = 7;
2417 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2418 pos_emitted = true;
2419 break;
2420 case TGSI_SEMANTIC_EDGEFLAG:
2421 output[j].array_base = 61;
2422 output[j].swizzle_x = 7;
2423 output[j].swizzle_y = 0;
2424 output[j].swizzle_z = 7;
2425 output[j].swizzle_w = 7;
2426 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2427 pos_emitted = true;
2428 break;
2429 case TGSI_SEMANTIC_LAYER:
2430 /* spi_sid is 0 for outputs that are
2431 * not consumed by PS */
2432 if (shader->output[i].spi_sid) {
2433 output[j].array_base = next_param_base++;
2434 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2435 j++;
2436 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2437 }
2438 output[j].array_base = 61;
2439 output[j].swizzle_x = 7;
2440 output[j].swizzle_y = 7;
2441 output[j].swizzle_z = 0;
2442 output[j].swizzle_w = 7;
2443 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2444 pos_emitted = true;
2445 break;
2446 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2447 /* spi_sid is 0 for outputs that are
2448 * not consumed by PS */
2449 if (shader->output[i].spi_sid) {
2450 output[j].array_base = next_param_base++;
2451 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2452 j++;
2453 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2454 }
2455 output[j].array_base = 61;
2456 output[j].swizzle_x = 7;
2457 output[j].swizzle_y = 7;
2458 output[j].swizzle_z = 7;
2459 output[j].swizzle_w = 0;
2460 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2461 pos_emitted = true;
2462 break;
2463 case TGSI_SEMANTIC_CLIPVERTEX:
2464 j--;
2465 break;
2466 case TGSI_SEMANTIC_CLIPDIST:
2467 output[j].array_base = next_clip_base++;
2468 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2469 pos_emitted = true;
2470 /* spi_sid is 0 for clipdistance outputs that were generated
2471 * for clipvertex - we don't need to pass them to PS */
2472 if (shader->output[i].spi_sid) {
2473 j++;
2474 /* duplicate it as PARAM to pass to the pixel shader */
2475 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2476 output[j].array_base = next_param_base++;
2477 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2478 }
2479 break;
2480 case TGSI_SEMANTIC_FOG:
2481 output[j].swizzle_y = 4; /* 0 */
2482 output[j].swizzle_z = 4; /* 0 */
2483 output[j].swizzle_w = 5; /* 1 */
2484 break;
2485 case TGSI_SEMANTIC_PRIMID:
2486 output[j].swizzle_x = 2;
2487 output[j].swizzle_y = 4; /* 0 */
2488 output[j].swizzle_z = 4; /* 0 */
2489 output[j].swizzle_w = 4; /* 0 */
2490 break;
2491 }
2492
2493 break;
2494 case TGSI_PROCESSOR_FRAGMENT:
2495 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
2496 /* never export more colors than the number of CBs */
2497 if (shader->output[i].sid >= max_color_exports) {
2498 /* skip export */
2499 j--;
2500 continue;
2501 }
2502 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
2503 output[j].array_base = shader->output[i].sid;
2504 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2505 shader->nr_ps_color_exports++;
2506 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
2507 for (k = 1; k < max_color_exports; k++) {
2508 j++;
2509 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2510 output[j].gpr = shader->output[i].gpr;
2511 output[j].elem_size = 3;
2512 output[j].swizzle_x = 0;
2513 output[j].swizzle_y = 1;
2514 output[j].swizzle_z = 2;
2515 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
2516 output[j].burst_count = 1;
2517 output[j].array_base = k;
2518 output[j].op = CF_OP_EXPORT;
2519 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2520 shader->nr_ps_color_exports++;
2521 }
2522 }
2523 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
2524 output[j].array_base = 61;
2525 output[j].swizzle_x = 2;
2526 output[j].swizzle_y = 7;
2527 output[j].swizzle_z = output[j].swizzle_w = 7;
2528 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2529 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
2530 output[j].array_base = 61;
2531 output[j].swizzle_x = 7;
2532 output[j].swizzle_y = 1;
2533 output[j].swizzle_z = output[j].swizzle_w = 7;
2534 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2535 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
2536 output[j].array_base = 61;
2537 output[j].swizzle_x = 7;
2538 output[j].swizzle_y = 7;
2539 output[j].swizzle_z = 0;
2540 output[j].swizzle_w = 7;
2541 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2542 } else {
2543 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
2544 r = -EINVAL;
2545 goto out_err;
2546 }
2547 break;
2548 default:
2549 R600_ERR("unsupported processor type %d\n", ctx.type);
2550 r = -EINVAL;
2551 goto out_err;
2552 }
2553
2554 if (output[j].type==-1) {
2555 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2556 output[j].array_base = next_param_base++;
2557 }
2558 }
2559
2560 /* add fake position export */
2561 if (ctx.type == TGSI_PROCESSOR_VERTEX && pos_emitted == false) {
2562 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2563 output[j].gpr = 0;
2564 output[j].elem_size = 3;
2565 output[j].swizzle_x = 7;
2566 output[j].swizzle_y = 7;
2567 output[j].swizzle_z = 7;
2568 output[j].swizzle_w = 7;
2569 output[j].burst_count = 1;
2570 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2571 output[j].array_base = 60;
2572 output[j].op = CF_OP_EXPORT;
2573 j++;
2574 }
2575
2576 /* add fake param output for vertex shader if no param is exported */
2577 if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
2578 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2579 output[j].gpr = 0;
2580 output[j].elem_size = 3;
2581 output[j].swizzle_x = 7;
2582 output[j].swizzle_y = 7;
2583 output[j].swizzle_z = 7;
2584 output[j].swizzle_w = 7;
2585 output[j].burst_count = 1;
2586 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2587 output[j].array_base = 0;
2588 output[j].op = CF_OP_EXPORT;
2589 j++;
2590 }
2591
2592 /* add fake pixel export */
2593 if (ctx.type == TGSI_PROCESSOR_FRAGMENT && shader->nr_ps_color_exports == 0) {
2594 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2595 output[j].gpr = 0;
2596 output[j].elem_size = 3;
2597 output[j].swizzle_x = 7;
2598 output[j].swizzle_y = 7;
2599 output[j].swizzle_z = 7;
2600 output[j].swizzle_w = 7;
2601 output[j].burst_count = 1;
2602 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2603 output[j].array_base = 0;
2604 output[j].op = CF_OP_EXPORT;
2605 j++;
2606 shader->nr_ps_color_exports++;
2607 }
2608
2609 noutput = j;
2610
2611 /* set export done on last export of each type */
2612 for (i = noutput - 1, output_done = 0; i >= 0; i--) {
2613 if (!(output_done & (1 << output[i].type))) {
2614 output_done |= (1 << output[i].type);
2615 output[i].op = CF_OP_EXPORT_DONE;
2616 }
2617 }
2618 /* add output to bytecode */
2619 if (!use_llvm) {
2620 for (i = 0; i < noutput; i++) {
2621 r = r600_bytecode_add_output(ctx.bc, &output[i]);
2622 if (r)
2623 goto out_err;
2624 }
2625 }
2626 }
2627
2628 /* add program end */
2629 if (!use_llvm) {
2630 if (ctx.bc->chip_class == CAYMAN)
2631 cm_bytecode_add_cf_end(ctx.bc);
2632 else {
2633 const struct cf_op_info *last = NULL;
2634
2635 if (ctx.bc->cf_last)
2636 last = r600_isa_cf(ctx.bc->cf_last->op);
2637
2638 /* alu clause instructions don't have EOP bit, so add NOP */
2639 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS)
2640 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2641
2642 ctx.bc->cf_last->end_of_program = 1;
2643 }
2644 }
2645
2646 /* check GPR limit - we have 124 = 128 - 4
2647 * (4 are reserved as alu clause temporary registers) */
2648 if (ctx.bc->ngpr > 124) {
2649 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
2650 r = -ENOMEM;
2651 goto out_err;
2652 }
2653
2654 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2655 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
2656 return r;
2657 }
2658
2659 free(ctx.literals);
2660 tgsi_parse_free(&ctx.parse);
2661 return 0;
2662 out_err:
2663 free(ctx.literals);
2664 tgsi_parse_free(&ctx.parse);
2665 return r;
2666 }
2667
2668 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
2669 {
2670 const unsigned tgsi_opcode =
2671 ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
2672 R600_ERR("%s tgsi opcode unsupported\n",
2673 tgsi_get_opcode_name(tgsi_opcode));
2674 return -EINVAL;
2675 }
2676
/* TGSI_OPCODE_END handler: the epilogue (exports, end-of-program) is
 * emitted by the caller, so the END token itself generates nothing. */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	return 0;
}
2681
2682 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
2683 const struct r600_shader_src *shader_src,
2684 unsigned chan)
2685 {
2686 bc_src->sel = shader_src->sel;
2687 bc_src->chan = shader_src->swizzle[chan];
2688 bc_src->neg = shader_src->neg;
2689 bc_src->abs = shader_src->abs;
2690 bc_src->rel = shader_src->rel;
2691 bc_src->value = shader_src->value[bc_src->chan];
2692 bc_src->kc_bank = shader_src->kc_bank;
2693 bc_src->kc_rel = shader_src->kc_rel;
2694 }
2695
2696 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
2697 {
2698 bc_src->abs = 1;
2699 bc_src->neg = 0;
2700 }
2701
2702 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
2703 {
2704 bc_src->neg = !bc_src->neg;
2705 }
2706
2707 static void tgsi_dst(struct r600_shader_ctx *ctx,
2708 const struct tgsi_full_dst_register *tgsi_dst,
2709 unsigned swizzle,
2710 struct r600_bytecode_alu_dst *r600_dst)
2711 {
2712 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2713
2714 r600_dst->sel = tgsi_dst->Register.Index;
2715 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
2716 r600_dst->chan = swizzle;
2717 r600_dst->write = 1;
2718 if (tgsi_dst->Register.Indirect)
2719 r600_dst->rel = V_SQ_REL_RELATIVE;
2720 if (inst->Instruction.Saturate) {
2721 r600_dst->clamp = 1;
2722 }
2723 }
2724
/* Return the index of the highest channel (0-3) set in writemask,
 * or 0 when no channel in that range is set. */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	for (chan = 3; chan > 0; chan--) {
		if (writemask & (1 << chan))
			break;
	}
	return chan;
}
2736
2737
2738
/* Shared emitter for 64-bit (double) two-source ALU ops.
 *
 * Doubles occupy channel pairs (xy and zw), so the TGSI writemask is
 * first widened to cover complete pairs.  With singledest set the
 * instruction logically produces one value per pair; when that value's
 * target channel is the upper half of a pair (y or w) the result is
 * routed through a temp and moved afterwards.  With swap set the two
 * source operands are exchanged.
 */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* 0 = no temp; 1/3 = temp needed, result lives in temp chan use_tmp-1 */
	int use_tmp = 0;

	if (singledest) {
		/* widen the mask to the full channel pair; a y or w target
		 * needs the temp-and-move path */
		switch (write_mask) {
		case 0x1:
			write_mask = 0x3;
			break;
		case 0x2:
			use_tmp = 1;
			write_mask = 0x3;
			break;
		case 0x4:
			write_mask = 0xc;
			break;
		case 0x8:
			write_mask = 0xc;
			use_tmp = 3;
			break;
		}
	}

	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			if (use_tmp) {
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
			}
			/* upper half of the pair carries no separate result */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				/* fp64_switch() (defined elsewhere) remaps the
				 * channel for 64-bit operand word ordering */
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases */
		if (i == 1 || i == 3) {
			/* modifiers are applied only on the odd channel —
			 * presumably the half holding the double's sign bit */
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_SUB:
				r600_bytecode_src_toggle_neg(&alu.src[1]);
				break;
			case TGSI_OPCODE_DABS:
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		write_mask = inst->Dst[0].Register.WriteMask;

		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			/* use_tmp of 1/3 selects temp channel 0/2 */
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2841
2842 static int tgsi_op2_64(struct r600_shader_ctx *ctx)
2843 {
2844 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2845 unsigned write_mask = inst->Dst[0].Register.WriteMask;
2846 /* confirm writemasking */
2847 if ((write_mask & 0x3) != 0x3 &&
2848 (write_mask & 0xc) != 0xc) {
2849 fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
2850 return -1;
2851 }
2852 return tgsi_op2_64_params(ctx, false, false);
2853 }
2854
/* 64-bit two-source op whose result is a single value per channel pair. */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false);
}
2859
/* Single-dest 64-bit op with the two source operands swapped. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true);
}
2864
/* Emit a three-source 64-bit ALU op.  All four channel slots are always
 * emitted; channels not covered by the writemask are directed to a
 * scratch temp so the instruction group still executes in full. */
static int tgsi_op3_64(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = 3;
	/* scratch destination for channels the writemask skips */
	int tmp = r600_get_temp(ctx);

	for (i = 0; i < lasti + 1; i++) {

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			/* sources read channel 1 for slots 0-2 and channel 0
			 * for slot 3 — NOTE(review): presumably matches the hw
			 * 64-bit op3 operand layout; confirm against the ISA */
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
		}

		if (inst->Dst[0].Register.WriteMask & (1 << i))
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		else
			alu.dst.sel = tmp;

		alu.dst.chan = i;
		alu.is_op3 = 1;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2897
2898 static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
2899 {
2900 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2901 struct r600_bytecode_alu alu;
2902 unsigned write_mask = inst->Dst[0].Register.WriteMask;
2903 int i, j, r, lasti = tgsi_last_instruction(write_mask);
2904 /* use temp register if trans_only and more than one dst component */
2905 int use_tmp = trans_only && (write_mask ^ (1 << lasti));
2906
2907 for (i = 0; i <= lasti; i++) {
2908 if (!(write_mask & (1 << i)))
2909 continue;
2910
2911 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2912 if (use_tmp) {
2913 alu.dst.sel = ctx->temp_reg;
2914 alu.dst.chan = i;
2915 alu.dst.write = 1;
2916 } else
2917 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2918
2919 alu.op = ctx->inst_info->op;
2920 if (!swap) {
2921 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
2922 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
2923 }
2924 } else {
2925 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2926 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2927 }
2928 /* handle some special cases */
2929 switch (inst->Instruction.Opcode) {
2930 case TGSI_OPCODE_SUB:
2931 r600_bytecode_src_toggle_neg(&alu.src[1]);
2932 break;
2933 case TGSI_OPCODE_ABS:
2934 r600_bytecode_src_set_abs(&alu.src[0]);
2935 break;
2936 default:
2937 break;
2938 }
2939 if (i == lasti || trans_only) {
2940 alu.last = 1;
2941 }
2942 r = r600_bytecode_add_alu(ctx->bc, &alu);
2943 if (r)
2944 return r;
2945 }
2946
2947 if (use_tmp) {
2948 /* move result from temp to dst */
2949 for (i = 0; i <= lasti; i++) {
2950 if (!(write_mask & (1 << i)))
2951 continue;
2952
2953 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2954 alu.op = ALU_OP1_MOV;
2955 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2956 alu.src[0].sel = ctx->temp_reg;
2957 alu.src[0].chan = i;
2958 alu.last = (i == lasti);
2959
2960 r = r600_bytecode_add_alu(ctx->bc, &alu);
2961 if (r)
2962 return r;
2963 }
2964 }
2965 return 0;
2966 }
2967
/* Plain op: TGSI operand order, no per-channel group termination. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
2972
/* Same as tgsi_op2 but with the two source operands exchanged. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
2977
/* Op emitted one instruction per group (trans_only path of tgsi_op2_s). */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
2982
2983 static int tgsi_ineg(struct r600_shader_ctx *ctx)
2984 {
2985 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2986 struct r600_bytecode_alu alu;
2987 int i, r;
2988 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2989
2990 for (i = 0; i < lasti + 1; i++) {
2991
2992 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2993 continue;
2994 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2995 alu.op = ctx->inst_info->op;
2996
2997 alu.src[0].sel = V_SQ_ALU_SRC_0;
2998
2999 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3000
3001 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3002
3003 if (i == lasti) {
3004 alu.last = 1;
3005 }
3006 r = r600_bytecode_add_alu(ctx->bc, &alu);
3007 if (r)
3008 return r;
3009 }
3010 return 0;
3011
3012 }
3013
3014 static int tgsi_dneg(struct r600_shader_ctx *ctx)
3015 {
3016 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3017 struct r600_bytecode_alu alu;
3018 int i, r;
3019 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3020
3021 for (i = 0; i < lasti + 1; i++) {
3022
3023 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3024 continue;
3025 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3026 alu.op = ALU_OP1_MOV;
3027
3028 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3029
3030 if (i == 1 || i == 3)
3031 r600_bytecode_src_toggle_neg(&alu.src[0]);
3032 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3033
3034 if (i == lasti) {
3035 alu.last = 1;
3036 }
3037 r = r600_bytecode_add_alu(ctx->bc, &alu);
3038 if (r)
3039 return r;
3040 }
3041 return 0;
3042
3043 }
3044
/* DFRACEXP: split a double into fraction (dst0) and exponent (dst1).
 * The hw op is run in all four slots into the temp; the code below then
 * copies temp.zw to dst0 and temp.y to dst1.
 * NOTE(review): assumes the hw op places the fraction in temp.zw and
 * the exponent in temp.y — confirm against the ISA documentation. */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;
	/* dst0 may be written as xy or zw; start at the first written pair */
	int firsti = write_mask == 0xc ? 2 : 0;

	/* run the op in every slot; fp64_switch() reorders the source
	 * dwords of each 64-bit pair for the hw operand layout */
	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* MOV first two channels to writemask dst0 */
	for (i = 0; i <= 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		/* fraction result sits in temp.z/temp.w */
		alu.src[0].chan = i + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], firsti + i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> (firsti + i)) & 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* copy the exponent (temp.y) to the first written channel of dst1
	 * and stop — only one exponent is produced per instruction */
	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			break;
		}
	}
	return 0;
}
3105
3106
/* I2D / U2D on evergreen/cayman: convert (unsigned) ints to doubles.
 * Step 1 converts the source ints to fp32 in the temp via the table op
 * (INT_TO_FLT / UINT_TO_FLT); step 2 widens each fp32 value to an fp64
 * pair with FLT32_TO_FLT64. */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
	       inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	/* temp[i] = (float)src[i]; one conversion per 64-bit result pair.
	 * NOTE(review): the bound (lasti+1)/2 yields one extra iteration
	 * for odd lasti — verify against the writemasks callers produce. */
	for (i = 0; i <= (lasti+1)/2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* widen: even channels feed the converted float, odd channels a
	 * literal 0 as the second dword of the fp64 pair */
	for (i = 0; i <= lasti; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_FLT32_TO_FLT64;

		alu.src[0].chan = i/2;
		if (i%2 == 0)
			alu.src[0].sel = ctx->temp_reg;
		else {
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0x0;
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3153
3154 static int egcm_double_to_int(struct r600_shader_ctx *ctx)
3155 {
3156 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3157 struct r600_bytecode_alu alu;
3158 int i, r;
3159 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3160
3161 assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
3162 inst->Instruction.Opcode == TGSI_OPCODE_D2U);
3163
3164 for (i = 0; i <= lasti; i++) {
3165 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3166 alu.op = ALU_OP1_FLT64_TO_FLT32;
3167
3168 r600_bytecode_src(&alu.src[0], &ctx->src[0], fp64_switch(i));
3169 alu.dst.chan = i;
3170 alu.dst.sel = ctx->temp_reg;
3171 alu.dst.write = i%2 == 0;
3172 alu.last = i == lasti;
3173
3174 r = r600_bytecode_add_alu(ctx->bc, &alu);
3175 if (r)
3176 return r;
3177 }
3178
3179 for (i = 0; i <= (lasti+1)/2; i++) {
3180 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3181 alu.op = ctx->inst_info->op;
3182
3183 alu.src[0].chan = i*2;
3184 alu.src[0].sel = ctx->temp_reg;
3185 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
3186 alu.last = 1;
3187
3188 r = r600_bytecode_add_alu(ctx->bc, &alu);
3189 if (r)
3190 return r;
3191 }
3192
3193 return 0;
3194 }
3195
/* Emit a scalar double op (DRSQ/DSQRT/DRCP family): the op is issued in
 * three slots with the source dwords split across src[0]/src[1]; the
 * 64-bit result lands in temp.xy and is then fanned out pairwise to the
 * written destination channels.
 * NOTE(review): assumes the hw writes the result to slots 0/1 — the
 * "X/Y" comment below reflects observed behavior; confirm in ISA docs. */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = 3;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* these have to write the result to X/Y by the looks of it */
	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		/* should only be one src regs */
		assert (inst->Instruction.NumSrcRegs == 1);

		/* the 64-bit operand is fed as two 32-bit sources:
		 * high dword in src[0], low dword in src[1] */
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);

		/* RSQ should take the absolute value of src */
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		    ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT) {
			r600_bytecode_src_set_abs(&alu.src[1]);
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		alu.dst.write = (i == 0 || i == 1);

		/* non-cayman parts end the instruction group on every slot;
		 * cayman closes it only once, at the final slot */
		if (ctx->bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* replicate the temp.xy pair into each written channel pair */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3249
3250 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
3251 {
3252 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3253 int i, j, r;
3254 struct r600_bytecode_alu alu;
3255 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
3256
3257 for (i = 0 ; i < last_slot; i++) {
3258 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3259 alu.op = ctx->inst_info->op;
3260 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3261 r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
3262
3263 /* RSQ should take the absolute value of src */
3264 if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
3265 r600_bytecode_src_set_abs(&alu.src[j]);
3266 }
3267 }
3268 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3269 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3270
3271 if (i == last_slot - 1)
3272 alu.last = 1;
3273 r = r600_bytecode_add_alu(ctx->bc, &alu);
3274 if (r)
3275 return r;
3276 }
3277 return 0;
3278 }
3279
3280 static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
3281 {
3282 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3283 int i, j, k, r;
3284 struct r600_bytecode_alu alu;
3285 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3286 int t1 = ctx->temp_reg;
3287
3288 for (k = 0; k <= lasti; k++) {
3289 if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
3290 continue;
3291
3292 for (i = 0 ; i < 4; i++) {
3293 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3294 alu.op = ctx->inst_info->op;
3295 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3296 r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
3297 }
3298 alu.dst.sel = t1;
3299 alu.dst.chan = i;
3300 alu.dst.write = (i == k);
3301 if (i == 3)
3302 alu.last = 1;
3303 r = r600_bytecode_add_alu(ctx->bc, &alu);
3304 if (r)
3305 return r;
3306 }
3307 }
3308
3309 for (i = 0 ; i <= lasti; i++) {
3310 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3311 continue;
3312 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3313 alu.op = ALU_OP1_MOV;
3314 alu.src[0].sel = t1;
3315 alu.src[0].chan = i;
3316 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3317 alu.dst.write = 1;
3318 if (i == lasti)
3319 alu.last = 1;
3320 r = r600_bytecode_add_alu(ctx->bc, &alu);
3321 if (r)
3322 return r;
3323 }
3324
3325 return 0;
3326 }
3327
3328
3329 static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
3330 {
3331 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3332 int i, j, k, r;
3333 struct r600_bytecode_alu alu;
3334 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3335 int t1 = ctx->temp_reg;
3336
3337 for (k = 0; k < 2; k++) {
3338 if (!(inst->Dst[0].Register.WriteMask & (0x3 << (k * 2))))
3339 continue;
3340
3341 for (i = 0; i < 4; i++) {
3342 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3343 alu.op = ctx->inst_info->op;
3344 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3345 r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));;
3346 }
3347 alu.dst.sel = t1;
3348 alu.dst.chan = i;
3349 alu.dst.write = 1;
3350 if (i == 3)
3351 alu.last = 1;
3352 r = r600_bytecode_add_alu(ctx->bc, &alu);
3353 if (r)
3354 return r;
3355 }
3356 }
3357
3358 for (i = 0; i <= lasti; i++) {
3359 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3360 continue;
3361 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3362 alu.op = ALU_OP1_MOV;
3363 alu.src[0].sel = t1;
3364 alu.src[0].chan = i;
3365 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3366 alu.dst.write = 1;
3367 if (i == lasti)
3368 alu.last = 1;
3369 r = r600_bytecode_add_alu(ctx->bc, &alu);
3370 if (r)
3371 return r;
3372 }
3373
3374 return 0;
3375 }
3376
3377 /*
3378 * r600 - trunc to -PI..PI range
3379 * r700 - normalize by dividing by 2PI
3380 * see fdo bug 27901
3381 */
/* Range-reduce the trig argument into temp.x:
 *   t = fract(src * 1/(2*PI) + 0.5)
 * then rescale: on r600 back to [-PI, PI] (t * 2*PI - PI); on r700+
 * the hw expects the normalized value, so only 0.5 is subtracted
 * (t * 1.0 - 0.5).  See the range-handling comment above. */
static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
{
	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
	static float double_pi = 3.1415926535 * 2;
	static float neg_pi = -3.1415926535;

	int r;
	struct r600_bytecode_alu alu;

	/* temp.x = src * (1/2PI) + 0.5 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

	/* NOTE: type-punning float bits into the literal slot */
	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[1].value = *(uint32_t *)&half_inv_pi;
	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
	alu.src[2].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.x = fract(temp.x) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FRACT;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.x = temp.x * scale + bias (chip-class dependent, below) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[2].chan = 0;

	if (ctx->bc->chip_class == R600) {
		/* r600: back to radians in [-PI, PI] */
		alu.src[1].value = *(uint32_t *)&double_pi;
		alu.src[2].value = *(uint32_t *)&neg_pi;
	} else {
		/* r700+: hw takes the normalized argument; just recenter */
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
		alu.src[2].neg = 1;
	}

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
3456
3457 static int cayman_trig(struct r600_shader_ctx *ctx)
3458 {
3459 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3460 struct r600_bytecode_alu alu;
3461 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
3462 int i, r;
3463
3464 r = tgsi_setup_trig(ctx);
3465 if (r)
3466 return r;
3467
3468
3469 for (i = 0; i < last_slot; i++) {
3470 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3471 alu.op = ctx->inst_info->op;
3472 alu.dst.chan = i;
3473
3474 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3475 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3476
3477 alu.src[0].sel = ctx->temp_reg;
3478 alu.src[0].chan = 0;
3479 if (i == last_slot - 1)
3480 alu.last = 1;
3481 r = r600_bytecode_add_alu(ctx->bc, &alu);
3482 if (r)
3483 return r;
3484 }
3485 return 0;
3486 }
3487
3488 static int tgsi_trig(struct r600_shader_ctx *ctx)
3489 {
3490 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3491 struct r600_bytecode_alu alu;
3492 int i, r;
3493 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3494
3495 r = tgsi_setup_trig(ctx);
3496 if (r)
3497 return r;
3498
3499 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3500 alu.op = ctx->inst_info->op;
3501 alu.dst.chan = 0;
3502 alu.dst.sel = ctx->temp_reg;
3503 alu.dst.write = 1;
3504
3505 alu.src[0].sel = ctx->temp_reg;
3506 alu.src[0].chan = 0;
3507 alu.last = 1;
3508 r = r600_bytecode_add_alu(ctx->bc, &alu);
3509 if (r)
3510 return r;
3511
3512 /* replicate result */
3513 for (i = 0; i < lasti + 1; i++) {
3514 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3515 continue;
3516
3517 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3518 alu.op = ALU_OP1_MOV;
3519
3520 alu.src[0].sel = ctx->temp_reg;
3521 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3522 if (i == lasti)
3523 alu.last = 1;
3524 r = r600_bytecode_add_alu(ctx->bc, &alu);
3525 if (r)
3526 return r;
3527 }
3528 return 0;
3529 }
3530
/* SCS: dst = (cos(src.x), sin(src.x), 0.0, 1.0), each component gated
 * by the writemask.  COS/SIN read the range-reduced value left in
 * temp.x by tgsi_setup_trig(); on cayman the scalar trig ops are
 * replicated over three slots with only the target slot writing. */
static int tgsi_scs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	/* We'll only need the trig stuff if we are going to write to the
	 * X or Y components of the destination vector.
	 */
	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
		r = tgsi_setup_trig(ctx);
		if (r)
			return r;
	}

	/* dst.x = COS */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_COS;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				/* only slot 0 actually lands in dst.x */
				if (i == 0)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_COS;
			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.y = SIN */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_SIN;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				/* only slot 1 actually lands in dst.y */
				if (i == 1)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_SIN;
			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.z = 0.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.w = 1.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3651
3652 static int tgsi_kill(struct r600_shader_ctx *ctx)
3653 {
3654 const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3655 struct r600_bytecode_alu alu;
3656 int i, r;
3657
3658 for (i = 0; i < 4; i++) {
3659 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3660 alu.op = ctx->inst_info->op;
3661
3662 alu.dst.chan = i;
3663
3664 alu.src[0].sel = V_SQ_ALU_SRC_0;
3665
3666 if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
3667 alu.src[1].sel = V_SQ_ALU_SRC_1;
3668 alu.src[1].neg = 1;
3669 } else {
3670 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3671 }
3672 if (i == 3) {
3673 alu.last = 1;
3674 }
3675 r = r600_bytecode_add_alu(ctx->bc, &alu);
3676 if (r)
3677 return r;
3678 }
3679
3680 /* kill must be last in ALU */
3681 ctx->bc->force_add_cf = 1;
3682 ctx->shader->uses_kill = TRUE;
3683 return 0;
3684 }
3685
/* LIT (lighting coefficients):
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0)
 *   dst.z = exp(MUL_LIT(log_clamped(max(src.y, 0)), src.w, src.x))
 *   dst.w = 1.0
 * The hw MUL_LIT op handles LIT's special cases (NOTE(review): exact
 * clamping semantics of MUL_LIT come from the ISA docs — confirm). */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* the expensive log/mul_lit/exp chain only when dst.z is written */
	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		int i;

		if (ctx->bc->chip_class == CAYMAN) {
			/* scalar op replicated over 3 slots; only slot 2 kept */
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the log result ended up (both branches
		 * leave it described by alu.dst) */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			/* scalar exp replicated; only slot 2 writes dst.z */
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
3835
3836 static int tgsi_rsq(struct r600_shader_ctx *ctx)
3837 {
3838 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3839 struct r600_bytecode_alu alu;
3840 int i, r;
3841
3842 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3843
3844 /* XXX:
3845 * For state trackers other than OpenGL, we'll want to use
3846 * _RECIPSQRT_IEEE instead.
3847 */
3848 alu.op = ALU_OP1_RECIPSQRT_CLAMPED;
3849
3850 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
3851 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
3852 r600_bytecode_src_set_abs(&alu.src[i]);
3853 }
3854 alu.dst.sel = ctx->temp_reg;
3855 alu.dst.write = 1;
3856 alu.last = 1;
3857 r = r600_bytecode_add_alu(ctx->bc, &alu);
3858 if (r)
3859 return r;
3860 /* replicate result */
3861 return tgsi_helper_tempx_replicate(ctx);
3862 }
3863
3864 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
3865 {
3866 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3867 struct r600_bytecode_alu alu;
3868 int i, r;
3869
3870 for (i = 0; i < 4; i++) {
3871 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3872 alu.src[0].sel = ctx->temp_reg;
3873 alu.op = ALU_OP1_MOV;
3874 alu.dst.chan = i;
3875 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3876 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3877 if (i == 3)
3878 alu.last = 1;
3879 r = r600_bytecode_add_alu(ctx->bc, &alu);
3880 if (r)
3881 return r;
3882 }
3883 return 0;
3884 }
3885
3886 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
3887 {
3888 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3889 struct r600_bytecode_alu alu;
3890 int i, r;
3891
3892 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3893 alu.op = ctx->inst_info->op;
3894 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
3895 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
3896 }
3897 alu.dst.sel = ctx->temp_reg;
3898 alu.dst.write = 1;
3899 alu.last = 1;
3900 r = r600_bytecode_add_alu(ctx->bc, &alu);
3901 if (r)
3902 return r;
3903 /* replicate result */
3904 return tgsi_helper_tempx_replicate(ctx);
3905 }
3906
3907 static int cayman_pow(struct r600_shader_ctx *ctx)
3908 {
3909 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3910 int i, r;
3911 struct r600_bytecode_alu alu;
3912 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
3913
3914 for (i = 0; i < 3; i++) {
3915 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3916 alu.op = ALU_OP1_LOG_IEEE;
3917 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
3918 alu.dst.sel = ctx->temp_reg;
3919 alu.dst.chan = i;
3920 alu.dst.write = 1;
3921 if (i == 2)
3922 alu.last = 1;
3923 r = r600_bytecode_add_alu(ctx->bc, &alu);
3924 if (r)
3925 return r;
3926 }
3927
3928 /* b * LOG2(a) */
3929 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3930 alu.op = ALU_OP2_MUL;
3931 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
3932 alu.src[1].sel = ctx->temp_reg;
3933 alu.dst.sel = ctx->temp_reg;
3934 alu.dst.write = 1;
3935 alu.last = 1;
3936 r = r600_bytecode_add_alu(ctx->bc, &alu);
3937 if (r)
3938 return r;
3939
3940 for (i = 0; i < last_slot; i++) {
3941 /* POW(a,b) = EXP2(b * LOG2(a))*/
3942 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3943 alu.op = ALU_OP1_EXP_IEEE;
3944 alu.src[0].sel = ctx->temp_reg;
3945
3946 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3947 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3948 if (i == last_slot - 1)
3949 alu.last = 1;
3950 r = r600_bytecode_add_alu(ctx->bc, &alu);
3951 if (r)
3952 return r;
3953 }
3954 return 0;
3955 }
3956
3957 static int tgsi_pow(struct r600_shader_ctx *ctx)
3958 {
3959 struct r600_bytecode_alu alu;
3960 int r;
3961
3962 /* LOG2(a) */
3963 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3964 alu.op = ALU_OP1_LOG_IEEE;
3965 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
3966 alu.dst.sel = ctx->temp_reg;
3967 alu.dst.write = 1;
3968 alu.last = 1;
3969 r = r600_bytecode_add_alu(ctx->bc, &alu);
3970 if (r)
3971 return r;
3972 /* b * LOG2(a) */
3973 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3974 alu.op = ALU_OP2_MUL;
3975 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
3976 alu.src[1].sel = ctx->temp_reg;
3977 alu.dst.sel = ctx->temp_reg;
3978 alu.dst.write = 1;
3979 alu.last = 1;
3980 r = r600_bytecode_add_alu(ctx->bc, &alu);
3981 if (r)
3982 return r;
3983 /* POW(a,b) = EXP2(b * LOG2(a))*/
3984 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3985 alu.op = ALU_OP1_EXP_IEEE;
3986 alu.src[0].sel = ctx->temp_reg;
3987 alu.dst.sel = ctx->temp_reg;
3988 alu.dst.write = 1;
3989 alu.last = 1;
3990 r = r600_bytecode_add_alu(ctx->bc, &alu);
3991 if (r)
3992 return r;
3993 return tgsi_helper_tempx_replicate(ctx);
3994 }
3995
3996 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
3997 {
3998 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3999 struct r600_bytecode_alu alu;
4000 int i, r, j;
4001 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4002 int tmp0 = ctx->temp_reg;
4003 int tmp1 = r600_get_temp(ctx);
4004 int tmp2 = r600_get_temp(ctx);
4005 int tmp3 = r600_get_temp(ctx);
4006 /* Unsigned path:
4007 *
4008 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
4009 *
4010 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
4011 * 2. tmp0.z = lo (tmp0.x * src2)
4012 * 3. tmp0.w = -tmp0.z
4013 * 4. tmp0.y = hi (tmp0.x * src2)
4014 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
4015 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
4016 * 7. tmp1.x = tmp0.x - tmp0.w
4017 * 8. tmp1.y = tmp0.x + tmp0.w
4018 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
4019 * 10. tmp0.z = hi(tmp0.x * src1) = q
4020 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
4021 *
4022 * 12. tmp0.w = src1 - tmp0.y = r
4023 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
4024 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
4025 *
4026 * if DIV
4027 *
4028 * 15. tmp1.z = tmp0.z + 1 = q + 1
4029 * 16. tmp1.w = tmp0.z - 1 = q - 1
4030 *
4031 * else MOD
4032 *
4033 * 15. tmp1.z = tmp0.w - src2 = r - src2
4034 * 16. tmp1.w = tmp0.w + src2 = r + src2
4035 *
4036 * endif
4037 *
4038 * 17. tmp1.x = tmp1.x & tmp1.y
4039 *
4040 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
4041 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
4042 *
4043 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
4044 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
4045 *
4046 * Signed path:
4047 *
4048 * Same as unsigned, using abs values of the operands,
4049 * and fixing the sign of the result in the end.
4050 */
4051
4052 for (i = 0; i < 4; i++) {
4053 if (!(write_mask & (1<<i)))
4054 continue;
4055
4056 if (signed_op) {
4057
4058 /* tmp2.x = -src0 */
4059 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4060 alu.op = ALU_OP2_SUB_INT;
4061
4062 alu.dst.sel = tmp2;
4063 alu.dst.chan = 0;
4064 alu.dst.write = 1;
4065
4066 alu.src[0].sel = V_SQ_ALU_SRC_0;
4067
4068 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4069
4070 alu.last = 1;
4071 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4072 return r;
4073
4074 /* tmp2.y = -src1 */
4075 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4076 alu.op = ALU_OP2_SUB_INT;
4077
4078 alu.dst.sel = tmp2;
4079 alu.dst.chan = 1;
4080 alu.dst.write = 1;
4081
4082 alu.src[0].sel = V_SQ_ALU_SRC_0;
4083
4084 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4085
4086 alu.last = 1;
4087 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4088 return r;
4089
4090 /* tmp2.z sign bit is set if src0 and src2 signs are different */
4091 /* it will be a sign of the quotient */
4092 if (!mod) {
4093
4094 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4095 alu.op = ALU_OP2_XOR_INT;
4096
4097 alu.dst.sel = tmp2;
4098 alu.dst.chan = 2;
4099 alu.dst.write = 1;
4100
4101 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4102 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4103
4104 alu.last = 1;
4105 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4106 return r;
4107 }
4108
4109 /* tmp2.x = |src0| */
4110 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4111 alu.op = ALU_OP3_CNDGE_INT;
4112 alu.is_op3 = 1;
4113
4114 alu.dst.sel = tmp2;
4115 alu.dst.chan = 0;
4116 alu.dst.write = 1;
4117
4118 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4119 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4120 alu.src[2].sel = tmp2;
4121 alu.src[2].chan = 0;
4122
4123 alu.last = 1;
4124 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4125 return r;
4126
4127 /* tmp2.y = |src1| */
4128 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4129 alu.op = ALU_OP3_CNDGE_INT;
4130 alu.is_op3 = 1;
4131
4132 alu.dst.sel = tmp2;
4133 alu.dst.chan = 1;
4134 alu.dst.write = 1;
4135
4136 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4137 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4138 alu.src[2].sel = tmp2;
4139 alu.src[2].chan = 1;
4140
4141 alu.last = 1;
4142 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4143 return r;
4144
4145 }
4146
4147 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
4148 if (ctx->bc->chip_class == CAYMAN) {
4149 /* tmp3.x = u2f(src2) */
4150 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4151 alu.op = ALU_OP1_UINT_TO_FLT;
4152
4153 alu.dst.sel = tmp3;
4154 alu.dst.chan = 0;
4155 alu.dst.write = 1;
4156
4157 if (signed_op) {
4158 alu.src[0].sel = tmp2;
4159 alu.src[0].chan = 1;
4160 } else {
4161 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4162 }
4163
4164 alu.last = 1;
4165 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4166 return r;
4167
4168 /* tmp0.x = recip(tmp3.x) */
4169 for (j = 0 ; j < 3; j++) {
4170 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4171 alu.op = ALU_OP1_RECIP_IEEE;
4172
4173 alu.dst.sel = tmp0;
4174 alu.dst.chan = j;
4175 alu.dst.write = (j == 0);
4176
4177 alu.src[0].sel = tmp3;
4178 alu.src[0].chan = 0;
4179
4180 if (j == 2)
4181 alu.last = 1;
4182 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4183 return r;
4184 }
4185
4186 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4187 alu.op = ALU_OP2_MUL;
4188
4189 alu.src[0].sel = tmp0;
4190 alu.src[0].chan = 0;
4191
4192 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4193 alu.src[1].value = 0x4f800000;
4194
4195 alu.dst.sel = tmp3;
4196 alu.dst.write = 1;
4197 alu.last = 1;
4198 r = r600_bytecode_add_alu(ctx->bc, &alu);
4199 if (r)
4200 return r;
4201
4202 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4203 alu.op = ALU_OP1_FLT_TO_UINT;
4204
4205 alu.dst.sel = tmp0;
4206 alu.dst.chan = 0;
4207 alu.dst.write = 1;
4208
4209 alu.src[0].sel = tmp3;
4210 alu.src[0].chan = 0;
4211
4212 alu.last = 1;
4213 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4214 return r;
4215
4216 } else {
4217 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4218 alu.op = ALU_OP1_RECIP_UINT;
4219
4220 alu.dst.sel = tmp0;
4221 alu.dst.chan = 0;
4222 alu.dst.write = 1;
4223
4224 if (signed_op) {
4225 alu.src[0].sel = tmp2;
4226 alu.src[0].chan = 1;
4227 } else {
4228 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4229 }
4230
4231 alu.last = 1;
4232 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4233 return r;
4234 }
4235
4236 /* 2. tmp0.z = lo (tmp0.x * src2) */
4237 if (ctx->bc->chip_class == CAYMAN) {
4238 for (j = 0 ; j < 4; j++) {
4239 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4240 alu.op = ALU_OP2_MULLO_UINT;
4241
4242 alu.dst.sel = tmp0;
4243 alu.dst.chan = j;
4244 alu.dst.write = (j == 2);
4245
4246 alu.src[0].sel = tmp0;
4247 alu.src[0].chan = 0;
4248 if (signed_op) {
4249 alu.src[1].sel = tmp2;
4250 alu.src[1].chan = 1;
4251 } else {
4252 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4253 }
4254
4255 alu.last = (j == 3);
4256 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4257 return r;
4258 }
4259 } else {
4260 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4261 alu.op = ALU_OP2_MULLO_UINT;
4262
4263 alu.dst.sel = tmp0;
4264 alu.dst.chan = 2;
4265 alu.dst.write = 1;
4266
4267 alu.src[0].sel = tmp0;
4268 alu.src[0].chan = 0;
4269 if (signed_op) {
4270 alu.src[1].sel = tmp2;
4271 alu.src[1].chan = 1;
4272 } else {
4273 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4274 }
4275
4276 alu.last = 1;
4277 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4278 return r;
4279 }
4280
4281 /* 3. tmp0.w = -tmp0.z */
4282 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4283 alu.op = ALU_OP2_SUB_INT;
4284
4285 alu.dst.sel = tmp0;
4286 alu.dst.chan = 3;
4287 alu.dst.write = 1;
4288
4289 alu.src[0].sel = V_SQ_ALU_SRC_0;
4290 alu.src[1].sel = tmp0;
4291 alu.src[1].chan = 2;
4292
4293 alu.last = 1;
4294 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4295 return r;
4296
4297 /* 4. tmp0.y = hi (tmp0.x * src2) */
4298 if (ctx->bc->chip_class == CAYMAN) {
4299 for (j = 0 ; j < 4; j++) {
4300 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4301 alu.op = ALU_OP2_MULHI_UINT;
4302
4303 alu.dst.sel = tmp0;
4304 alu.dst.chan = j;
4305 alu.dst.write = (j == 1);
4306
4307 alu.src[0].sel = tmp0;
4308 alu.src[0].chan = 0;
4309
4310 if (signed_op) {
4311 alu.src[1].sel = tmp2;
4312 alu.src[1].chan = 1;
4313 } else {
4314 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4315 }
4316 alu.last = (j == 3);
4317 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4318 return r;
4319 }
4320 } else {
4321 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4322 alu.op = ALU_OP2_MULHI_UINT;
4323
4324 alu.dst.sel = tmp0;
4325 alu.dst.chan = 1;
4326 alu.dst.write = 1;
4327
4328 alu.src[0].sel = tmp0;
4329 alu.src[0].chan = 0;
4330
4331 if (signed_op) {
4332 alu.src[1].sel = tmp2;
4333 alu.src[1].chan = 1;
4334 } else {
4335 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4336 }
4337
4338 alu.last = 1;
4339 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4340 return r;
4341 }
4342
4343 /* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src)) */
4344 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4345 alu.op = ALU_OP3_CNDE_INT;
4346 alu.is_op3 = 1;
4347
4348 alu.dst.sel = tmp0;
4349 alu.dst.chan = 2;
4350 alu.dst.write = 1;
4351
4352 alu.src[0].sel = tmp0;
4353 alu.src[0].chan = 1;
4354 alu.src[1].sel = tmp0;
4355 alu.src[1].chan = 3;
4356 alu.src[2].sel = tmp0;
4357 alu.src[2].chan = 2;
4358
4359 alu.last = 1;
4360 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4361 return r;
4362
4363 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
4364 if (ctx->bc->chip_class == CAYMAN) {
4365 for (j = 0 ; j < 4; j++) {
4366 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4367 alu.op = ALU_OP2_MULHI_UINT;
4368
4369 alu.dst.sel = tmp0;
4370 alu.dst.chan = j;
4371 alu.dst.write = (j == 3);
4372
4373 alu.src[0].sel = tmp0;
4374 alu.src[0].chan = 2;
4375
4376 alu.src[1].sel = tmp0;
4377 alu.src[1].chan = 0;
4378
4379 alu.last = (j == 3);
4380 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4381 return r;
4382 }
4383 } else {
4384 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4385 alu.op = ALU_OP2_MULHI_UINT;
4386
4387 alu.dst.sel = tmp0;
4388 alu.dst.chan = 3;
4389 alu.dst.write = 1;
4390
4391 alu.src[0].sel = tmp0;
4392 alu.src[0].chan = 2;
4393
4394 alu.src[1].sel = tmp0;
4395 alu.src[1].chan = 0;
4396
4397 alu.last = 1;
4398 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4399 return r;
4400 }
4401
4402 /* 7. tmp1.x = tmp0.x - tmp0.w */
4403 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4404 alu.op = ALU_OP2_SUB_INT;
4405
4406 alu.dst.sel = tmp1;
4407 alu.dst.chan = 0;
4408 alu.dst.write = 1;
4409
4410 alu.src[0].sel = tmp0;
4411 alu.src[0].chan = 0;
4412 alu.src[1].sel = tmp0;
4413 alu.src[1].chan = 3;
4414
4415 alu.last = 1;
4416 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4417 return r;
4418
4419 /* 8. tmp1.y = tmp0.x + tmp0.w */
4420 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4421 alu.op = ALU_OP2_ADD_INT;
4422
4423 alu.dst.sel = tmp1;
4424 alu.dst.chan = 1;
4425 alu.dst.write = 1;
4426
4427 alu.src[0].sel = tmp0;
4428 alu.src[0].chan = 0;
4429 alu.src[1].sel = tmp0;
4430 alu.src[1].chan = 3;
4431
4432 alu.last = 1;
4433 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4434 return r;
4435
4436 /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
4437 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4438 alu.op = ALU_OP3_CNDE_INT;
4439 alu.is_op3 = 1;
4440
4441 alu.dst.sel = tmp0;
4442 alu.dst.chan = 0;
4443 alu.dst.write = 1;
4444
4445 alu.src[0].sel = tmp0;
4446 alu.src[0].chan = 1;
4447 alu.src[1].sel = tmp1;
4448 alu.src[1].chan = 1;
4449 alu.src[2].sel = tmp1;
4450 alu.src[2].chan = 0;
4451
4452 alu.last = 1;
4453 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4454 return r;
4455
4456 /* 10. tmp0.z = hi(tmp0.x * src1) = q */
4457 if (ctx->bc->chip_class == CAYMAN) {
4458 for (j = 0 ; j < 4; j++) {
4459 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4460 alu.op = ALU_OP2_MULHI_UINT;
4461
4462 alu.dst.sel = tmp0;
4463 alu.dst.chan = j;
4464 alu.dst.write = (j == 2);
4465
4466 alu.src[0].sel = tmp0;
4467 alu.src[0].chan = 0;
4468
4469 if (signed_op) {
4470 alu.src[1].sel = tmp2;
4471 alu.src[1].chan = 0;
4472 } else {
4473 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4474 }
4475
4476 alu.last = (j == 3);
4477 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4478 return r;
4479 }
4480 } else {
4481 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4482 alu.op = ALU_OP2_MULHI_UINT;
4483
4484 alu.dst.sel = tmp0;
4485 alu.dst.chan = 2;
4486 alu.dst.write = 1;
4487
4488 alu.src[0].sel = tmp0;
4489 alu.src[0].chan = 0;
4490
4491 if (signed_op) {
4492 alu.src[1].sel = tmp2;
4493 alu.src[1].chan = 0;
4494 } else {
4495 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4496 }
4497
4498 alu.last = 1;
4499 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4500 return r;
4501 }
4502
4503 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
4504 if (ctx->bc->chip_class == CAYMAN) {
4505 for (j = 0 ; j < 4; j++) {
4506 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4507 alu.op = ALU_OP2_MULLO_UINT;
4508
4509 alu.dst.sel = tmp0;
4510 alu.dst.chan = j;
4511 alu.dst.write = (j == 1);
4512
4513 if (signed_op) {
4514 alu.src[0].sel = tmp2;
4515 alu.src[0].chan = 1;
4516 } else {
4517 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4518 }
4519
4520 alu.src[1].sel = tmp0;
4521 alu.src[1].chan = 2;
4522
4523 alu.last = (j == 3);
4524 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4525 return r;
4526 }
4527 } else {
4528 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4529 alu.op = ALU_OP2_MULLO_UINT;
4530
4531 alu.dst.sel = tmp0;
4532 alu.dst.chan = 1;
4533 alu.dst.write = 1;
4534
4535 if (signed_op) {
4536 alu.src[0].sel = tmp2;
4537 alu.src[0].chan = 1;
4538 } else {
4539 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4540 }
4541
4542 alu.src[1].sel = tmp0;
4543 alu.src[1].chan = 2;
4544
4545 alu.last = 1;
4546 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4547 return r;
4548 }
4549
4550 /* 12. tmp0.w = src1 - tmp0.y = r */
4551 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4552 alu.op = ALU_OP2_SUB_INT;
4553
4554 alu.dst.sel = tmp0;
4555 alu.dst.chan = 3;
4556 alu.dst.write = 1;
4557
4558 if (signed_op) {
4559 alu.src[0].sel = tmp2;
4560 alu.src[0].chan = 0;
4561 } else {
4562 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4563 }
4564
4565 alu.src[1].sel = tmp0;
4566 alu.src[1].chan = 1;
4567
4568 alu.last = 1;
4569 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4570 return r;
4571
4572 /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
4573 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4574 alu.op = ALU_OP2_SETGE_UINT;
4575
4576 alu.dst.sel = tmp1;
4577 alu.dst.chan = 0;
4578 alu.dst.write = 1;
4579
4580 alu.src[0].sel = tmp0;
4581 alu.src[0].chan = 3;
4582 if (signed_op) {
4583 alu.src[1].sel = tmp2;
4584 alu.src[1].chan = 1;
4585 } else {
4586 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4587 }
4588
4589 alu.last = 1;
4590 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4591 return r;
4592
4593 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
4594 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4595 alu.op = ALU_OP2_SETGE_UINT;
4596
4597 alu.dst.sel = tmp1;
4598 alu.dst.chan = 1;
4599 alu.dst.write = 1;
4600
4601 if (signed_op) {
4602 alu.src[0].sel = tmp2;
4603 alu.src[0].chan = 0;
4604 } else {
4605 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4606 }
4607
4608 alu.src[1].sel = tmp0;
4609 alu.src[1].chan = 1;
4610
4611 alu.last = 1;
4612 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4613 return r;
4614
4615 if (mod) { /* UMOD */
4616
4617 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */
4618 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4619 alu.op = ALU_OP2_SUB_INT;
4620
4621 alu.dst.sel = tmp1;
4622 alu.dst.chan = 2;
4623 alu.dst.write = 1;
4624
4625 alu.src[0].sel = tmp0;
4626 alu.src[0].chan = 3;
4627
4628 if (signed_op) {
4629 alu.src[1].sel = tmp2;
4630 alu.src[1].chan = 1;
4631 } else {
4632 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4633 }
4634
4635 alu.last = 1;
4636 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4637 return r;
4638
4639 /* 16. tmp1.w = tmp0.w + src2 = r + src2 */
4640 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4641 alu.op = ALU_OP2_ADD_INT;
4642
4643 alu.dst.sel = tmp1;
4644 alu.dst.chan = 3;
4645 alu.dst.write = 1;
4646
4647 alu.src[0].sel = tmp0;
4648 alu.src[0].chan = 3;
4649 if (signed_op) {
4650 alu.src[1].sel = tmp2;
4651 alu.src[1].chan = 1;
4652 } else {
4653 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4654 }
4655
4656 alu.last = 1;
4657 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4658 return r;
4659
4660 } else { /* UDIV */
4661
4662 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
4663 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4664 alu.op = ALU_OP2_ADD_INT;
4665
4666 alu.dst.sel = tmp1;
4667 alu.dst.chan = 2;
4668 alu.dst.write = 1;
4669
4670 alu.src[0].sel = tmp0;
4671 alu.src[0].chan = 2;
4672 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
4673
4674 alu.last = 1;
4675 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4676 return r;
4677
4678 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */
4679 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4680 alu.op = ALU_OP2_ADD_INT;
4681
4682 alu.dst.sel = tmp1;
4683 alu.dst.chan = 3;
4684 alu.dst.write = 1;
4685
4686 alu.src[0].sel = tmp0;
4687 alu.src[0].chan = 2;
4688 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
4689
4690 alu.last = 1;
4691 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4692 return r;
4693
4694 }
4695
4696 /* 17. tmp1.x = tmp1.x & tmp1.y */
4697 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4698 alu.op = ALU_OP2_AND_INT;
4699
4700 alu.dst.sel = tmp1;
4701 alu.dst.chan = 0;
4702 alu.dst.write = 1;
4703
4704 alu.src[0].sel = tmp1;
4705 alu.src[0].chan = 0;
4706 alu.src[1].sel = tmp1;
4707 alu.src[1].chan = 1;
4708
4709 alu.last = 1;
4710 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4711 return r;
4712
4713 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
4714 /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
4715 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4716 alu.op = ALU_OP3_CNDE_INT;
4717 alu.is_op3 = 1;
4718
4719 alu.dst.sel = tmp0;
4720 alu.dst.chan = 2;
4721 alu.dst.write = 1;
4722
4723 alu.src[0].sel = tmp1;
4724 alu.src[0].chan = 0;
4725 alu.src[1].sel = tmp0;
4726 alu.src[1].chan = mod ? 3 : 2;
4727 alu.src[2].sel = tmp1;
4728 alu.src[2].chan = 2;
4729
4730 alu.last = 1;
4731 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4732 return r;
4733
4734 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
4735 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4736 alu.op = ALU_OP3_CNDE_INT;
4737 alu.is_op3 = 1;
4738
4739 if (signed_op) {
4740 alu.dst.sel = tmp0;
4741 alu.dst.chan = 2;
4742 alu.dst.write = 1;
4743 } else {
4744 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4745 }
4746
4747 alu.src[0].sel = tmp1;
4748 alu.src[0].chan = 1;
4749 alu.src[1].sel = tmp1;
4750 alu.src[1].chan = 3;
4751 alu.src[2].sel = tmp0;
4752 alu.src[2].chan = 2;
4753
4754 alu.last = 1;
4755 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4756 return r;
4757
4758 if (signed_op) {
4759
4760 /* fix the sign of the result */
4761
4762 if (mod) {
4763
4764 /* tmp0.x = -tmp0.z */
4765 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4766 alu.op = ALU_OP2_SUB_INT;
4767
4768 alu.dst.sel = tmp0;
4769 alu.dst.chan = 0;
4770 alu.dst.write = 1;
4771
4772 alu.src[0].sel = V_SQ_ALU_SRC_0;
4773 alu.src[1].sel = tmp0;
4774 alu.src[1].chan = 2;
4775
4776 alu.last = 1;
4777 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4778 return r;
4779
4780 /* sign of the remainder is the same as the sign of src0 */
4781 /* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
4782 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4783 alu.op = ALU_OP3_CNDGE_INT;
4784 alu.is_op3 = 1;
4785
4786 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4787
4788 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4789 alu.src[1].sel = tmp0;
4790 alu.src[1].chan = 2;
4791 alu.src[2].sel = tmp0;
4792 alu.src[2].chan = 0;
4793
4794 alu.last = 1;
4795 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4796 return r;
4797
4798 } else {
4799
4800 /* tmp0.x = -tmp0.z */
4801 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4802 alu.op = ALU_OP2_SUB_INT;
4803
4804 alu.dst.sel = tmp0;
4805 alu.dst.chan = 0;
4806 alu.dst.write = 1;
4807
4808 alu.src[0].sel = V_SQ_ALU_SRC_0;
4809 alu.src[1].sel = tmp0;
4810 alu.src[1].chan = 2;
4811
4812 alu.last = 1;
4813 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4814 return r;
4815
4816 /* fix the quotient sign (same as the sign of src0*src1) */
4817 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
4818 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4819 alu.op = ALU_OP3_CNDGE_INT;
4820 alu.is_op3 = 1;
4821
4822 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4823
4824 alu.src[0].sel = tmp2;
4825 alu.src[0].chan = 2;
4826 alu.src[1].sel = tmp0;
4827 alu.src[1].chan = 2;
4828 alu.src[2].sel = tmp0;
4829 alu.src[2].chan = 0;
4830
4831 alu.last = 1;
4832 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4833 return r;
4834 }
4835 }
4836 }
4837 return 0;
4838 }
4839
/* TGSI UDIV: per-channel unsigned integer division.
 * Thin wrapper: tgsi_divmod(mod=0 -> quotient, signed_op=0 -> unsigned). */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 0);
}
4844
/* TGSI UMOD: per-channel unsigned integer remainder.
 * Thin wrapper: tgsi_divmod(mod=1 -> remainder, signed_op=0 -> unsigned). */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 0);
}
4849
/* TGSI IDIV: per-channel signed integer division.
 * Thin wrapper: tgsi_divmod(mod=0 -> quotient, signed_op=1 -> signed;
 * tgsi_divmod takes |src| operands and fixes up the result sign). */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 1);
}
4854
/* TGSI MOD (signed): per-channel signed integer remainder.
 * Thin wrapper: tgsi_divmod(mod=1 -> remainder, signed_op=1 -> signed;
 * the remainder's sign is made to follow src0 inside tgsi_divmod). */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 1);
}
4859
4860
4861 static int tgsi_f2i(struct r600_shader_ctx *ctx)
4862 {
4863 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4864 struct r600_bytecode_alu alu;
4865 int i, r;
4866 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4867 int last_inst = tgsi_last_instruction(write_mask);
4868
4869 for (i = 0; i < 4; i++) {
4870 if (!(write_mask & (1<<i)))
4871 continue;
4872
4873 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4874 alu.op = ALU_OP1_TRUNC;
4875
4876 alu.dst.sel = ctx->temp_reg;
4877 alu.dst.chan = i;
4878 alu.dst.write = 1;
4879
4880 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4881 if (i == last_inst)
4882 alu.last = 1;
4883 r = r600_bytecode_add_alu(ctx->bc, &alu);
4884 if (r)
4885 return r;
4886 }
4887
4888 for (i = 0; i < 4; i++) {
4889 if (!(write_mask & (1<<i)))
4890 continue;
4891
4892 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4893 alu.op = ctx->inst_info->op;
4894
4895 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4896
4897 alu.src[0].sel = ctx->temp_reg;
4898 alu.src[0].chan = i;
4899
4900 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
4901 alu.last = 1;
4902 r = r600_bytecode_add_alu(ctx->bc, &alu);
4903 if (r)
4904 return r;
4905 }
4906
4907 return 0;
4908 }
4909
4910 static int tgsi_iabs(struct r600_shader_ctx *ctx)
4911 {
4912 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4913 struct r600_bytecode_alu alu;
4914 int i, r;
4915 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4916 int last_inst = tgsi_last_instruction(write_mask);
4917
4918 /* tmp = -src */
4919 for (i = 0; i < 4; i++) {
4920 if (!(write_mask & (1<<i)))
4921 continue;
4922
4923 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4924 alu.op = ALU_OP2_SUB_INT;
4925
4926 alu.dst.sel = ctx->temp_reg;
4927 alu.dst.chan = i;
4928 alu.dst.write = 1;
4929
4930 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4931 alu.src[0].sel = V_SQ_ALU_SRC_0;
4932
4933 if (i == last_inst)
4934 alu.last = 1;
4935 r = r600_bytecode_add_alu(ctx->bc, &alu);
4936 if (r)
4937 return r;
4938 }
4939
4940 /* dst = (src >= 0 ? src : tmp) */
4941 for (i = 0; i < 4; i++) {
4942 if (!(write_mask & (1<<i)))
4943 continue;
4944
4945 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4946 alu.op = ALU_OP3_CNDGE_INT;
4947 alu.is_op3 = 1;
4948 alu.dst.write = 1;
4949
4950 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4951
4952 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4953 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4954 alu.src[2].sel = ctx->temp_reg;
4955 alu.src[2].chan = i;
4956
4957 if (i == last_inst)
4958 alu.last = 1;
4959 r = r600_bytecode_add_alu(ctx->bc, &alu);
4960 if (r)
4961 return r;
4962 }
4963 return 0;
4964 }
4965
4966 static int tgsi_issg(struct r600_shader_ctx *ctx)
4967 {
4968 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4969 struct r600_bytecode_alu alu;
4970 int i, r;
4971 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4972 int last_inst = tgsi_last_instruction(write_mask);
4973
4974 /* tmp = (src >= 0 ? src : -1) */
4975 for (i = 0; i < 4; i++) {
4976 if (!(write_mask & (1<<i)))
4977 continue;
4978
4979 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4980 alu.op = ALU_OP3_CNDGE_INT;
4981 alu.is_op3 = 1;
4982
4983 alu.dst.sel = ctx->temp_reg;
4984 alu.dst.chan = i;
4985 alu.dst.write = 1;
4986
4987 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4988 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4989 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
4990
4991 if (i == last_inst)
4992 alu.last = 1;
4993 r = r600_bytecode_add_alu(ctx->bc, &alu);
4994 if (r)
4995 return r;
4996 }
4997
4998 /* dst = (tmp > 0 ? 1 : tmp) */
4999 for (i = 0; i < 4; i++) {
5000 if (!(write_mask & (1<<i)))
5001 continue;
5002
5003 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5004 alu.op = ALU_OP3_CNDGT_INT;
5005 alu.is_op3 = 1;
5006 alu.dst.write = 1;
5007
5008 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5009
5010 alu.src[0].sel = ctx->temp_reg;
5011 alu.src[0].chan = i;
5012
5013 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
5014
5015 alu.src[2].sel = ctx->temp_reg;
5016 alu.src[2].chan = i;
5017
5018 if (i == last_inst)
5019 alu.last = 1;
5020 r = r600_bytecode_add_alu(ctx->bc, &alu);
5021 if (r)
5022 return r;
5023 }
5024 return 0;
5025 }
5026
5027
5028
5029 static int tgsi_ssg(struct r600_shader_ctx *ctx)
5030 {
5031 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5032 struct r600_bytecode_alu alu;
5033 int i, r;
5034
5035 /* tmp = (src > 0 ? 1 : src) */
5036 for (i = 0; i < 4; i++) {
5037 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5038 alu.op = ALU_OP3_CNDGT;
5039 alu.is_op3 = 1;
5040
5041 alu.dst.sel = ctx->temp_reg;
5042 alu.dst.chan = i;
5043
5044 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5045 alu.src[1].sel = V_SQ_ALU_SRC_1;
5046 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
5047
5048 if (i == 3)
5049 alu.last = 1;
5050 r = r600_bytecode_add_alu(ctx->bc, &alu);
5051 if (r)
5052 return r;
5053 }
5054
5055 /* dst = (-tmp > 0 ? -1 : tmp) */
5056 for (i = 0; i < 4; i++) {
5057 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5058 alu.op = ALU_OP3_CNDGT;
5059 alu.is_op3 = 1;
5060 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5061
5062 alu.src[0].sel = ctx->temp_reg;
5063 alu.src[0].chan = i;
5064 alu.src[0].neg = 1;
5065
5066 alu.src[1].sel = V_SQ_ALU_SRC_1;
5067 alu.src[1].neg = 1;
5068
5069 alu.src[2].sel = ctx->temp_reg;
5070 alu.src[2].chan = i;
5071
5072 if (i == 3)
5073 alu.last = 1;
5074 r = r600_bytecode_add_alu(ctx->bc, &alu);
5075 if (r)
5076 return r;
5077 }
5078 return 0;
5079 }
5080
5081 static int tgsi_bfi(struct r600_shader_ctx *ctx)
5082 {
5083 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5084 struct r600_bytecode_alu alu;
5085 int i, r, t1, t2;
5086
5087 unsigned write_mask = inst->Dst[0].Register.WriteMask;
5088 int last_inst = tgsi_last_instruction(write_mask);
5089
5090 t1 = ctx->temp_reg;
5091
5092 for (i = 0; i < 4; i++) {
5093 if (!(write_mask & (1<<i)))
5094 continue;
5095
5096 /* create mask tmp */
5097 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5098 alu.op = ALU_OP2_BFM_INT;
5099 alu.dst.sel = t1;
5100 alu.dst.chan = i;
5101 alu.dst.write = 1;
5102 alu.last = i == last_inst;
5103
5104 r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
5105 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5106
5107 r = r600_bytecode_add_alu(ctx->bc, &alu);
5108 if (r)
5109 return r;
5110 }
5111
5112 t2 = r600_get_temp(ctx);
5113
5114 for (i = 0; i < 4; i++) {
5115 if (!(write_mask & (1<<i)))
5116 continue;
5117
5118 /* shift insert left */
5119 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5120 alu.op = ALU_OP2_LSHL_INT;
5121 alu.dst.sel = t2;
5122 alu.dst.chan = i;
5123 alu.dst.write = 1;
5124 alu.last = i == last_inst;
5125
5126 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5127 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5128
5129 r = r600_bytecode_add_alu(ctx->bc, &alu);
5130 if (r)
5131 return r;
5132 }
5133
5134 for (i = 0; i < 4; i++) {
5135 if (!(write_mask & (1<<i)))
5136 continue;
5137
5138 /* actual bitfield insert */
5139 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5140 alu.op = ALU_OP3_BFI_INT;
5141 alu.is_op3 = 1;
5142 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5143 alu.dst.chan = i;
5144 alu.dst.write = 1;
5145 alu.last = i == last_inst;
5146
5147 alu.src[0].sel = t1;
5148 alu.src[0].chan = i;
5149 alu.src[1].sel = t2;
5150 alu.src[1].chan = i;
5151 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
5152
5153 r = r600_bytecode_add_alu(ctx->bc, &alu);
5154 if (r)
5155 return r;
5156 }
5157
5158 return 0;
5159 }
5160
5161 static int tgsi_msb(struct r600_shader_ctx *ctx)
5162 {
5163 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5164 struct r600_bytecode_alu alu;
5165 int i, r, t1, t2;
5166
5167 unsigned write_mask = inst->Dst[0].Register.WriteMask;
5168 int last_inst = tgsi_last_instruction(write_mask);
5169
5170 assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
5171 ctx->inst_info->op == ALU_OP1_FFBH_UINT);
5172
5173 t1 = ctx->temp_reg;
5174
5175 /* bit position is indexed from lsb by TGSI, and from msb by the hardware */
5176 for (i = 0; i < 4; i++) {
5177 if (!(write_mask & (1<<i)))
5178 continue;
5179
5180 /* t1 = FFBH_INT / FFBH_UINT */
5181 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5182 alu.op = ctx->inst_info->op;
5183 alu.dst.sel = t1;
5184 alu.dst.chan = i;
5185 alu.dst.write = 1;
5186 alu.last = i == last_inst;
5187
5188 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5189
5190 r = r600_bytecode_add_alu(ctx->bc, &alu);
5191 if (r)
5192 return r;
5193 }
5194
5195 t2 = r600_get_temp(ctx);
5196
5197 for (i = 0; i < 4; i++) {
5198 if (!(write_mask & (1<<i)))
5199 continue;
5200
5201 /* t2 = 31 - t1 */
5202 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5203 alu.op = ALU_OP2_SUB_INT;
5204 alu.dst.sel = t2;
5205 alu.dst.chan = i;
5206 alu.dst.write = 1;
5207 alu.last = i == last_inst;
5208
5209 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
5210 alu.src[0].value = 31;
5211 alu.src[1].sel = t1;
5212 alu.src[1].chan = i;
5213
5214 r = r600_bytecode_add_alu(ctx->bc, &alu);
5215 if (r)
5216 return r;
5217 }
5218
5219 for (i = 0; i < 4; i++) {
5220 if (!(write_mask & (1<<i)))
5221 continue;
5222
5223 /* result = t1 >= 0 ? t2 : t1 */
5224 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5225 alu.op = ALU_OP3_CNDGE_INT;
5226 alu.is_op3 = 1;
5227 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5228 alu.dst.chan = i;
5229 alu.dst.write = 1;
5230 alu.last = i == last_inst;
5231
5232 alu.src[0].sel = t1;
5233 alu.src[0].chan = i;
5234 alu.src[1].sel = t2;
5235 alu.src[1].chan = i;
5236 alu.src[2].sel = t1;
5237 alu.src[2].chan = i;
5238
5239 r = r600_bytecode_add_alu(ctx->bc, &alu);
5240 if (r)
5241 return r;
5242 }
5243
5244 return 0;
5245 }
5246
/* Translate TGSI INTERP_CENTROID / INTERP_OFFSET / INTERP_SAMPLE for
 * Evergreen/Cayman.  For OFFSET/SAMPLE the barycentric i/j pair is
 * adjusted with screen-space gradients before the INTERP_XY/ZW ops run;
 * CENTROID uses the pre-loaded centroid interpolator directly. */
static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	int input;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	input = inst->Src[0].Register.Index;

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
	}

	/* Locate the i/j interpolator pair for this input's interpolation
	 * mode+location; fall back to index 0 if none was allocated. */
	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	/* Two i/j pairs are packed per GPR: pair k lives in GPR k/2,
	 * channels 2*(k%2) .. 2*(k%2)+1. */
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			/* Fetch the x/y offset of the requested sample position. */
			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
		}

		/* Compute d(i,j)/dx and d(i,j)/dy of the barycentrics via the
		 * texture unit's gradient ops. */
		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7;
			tex.dst_sel_w = 7;
			tex.inst_mod = 1; // Use per pixel gradient calculation
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		/* temp.xy = gradientsH * offset_x + ij
		 * (offset_x is src1.x for INTERP_OFFSET, or the sample
		 * position x stored in sample_gpr.z for INTERP_SAMPLE). */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.xy = gradientsV * offset_y + temp.xy — now temp holds
		 * the offset-adjusted i/j pair. */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* Emit two full ALU groups: INTERP_ZW (slots 0-3) then INTERP_XY
	 * (slots 4-7).  Only slots 2-5 carry live results (ZW writes z/w,
	 * XY writes x/y); the ij operand alternates j,i per slot pair. */
	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2);
		} else {
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		alu.last = i % 4 == 3;
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	// INTERP can't swizzle dst
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		/* Apply the source swizzle with plain MOVs into the real dst. */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5410
5411
5412 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
5413 {
5414 struct r600_bytecode_alu alu;
5415 int i, r;
5416
5417 for (i = 0; i < 4; i++) {
5418 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5419 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
5420 alu.op = ALU_OP0_NOP;
5421 alu.dst.chan = i;
5422 } else {
5423 alu.op = ALU_OP1_MOV;
5424 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5425 alu.src[0].sel = ctx->temp_reg;
5426 alu.src[0].chan = i;
5427 }
5428 if (i == 3) {
5429 alu.last = 1;
5430 }
5431 r = r600_bytecode_add_alu(ctx->bc, &alu);
5432 if (r)
5433 return r;
5434 }
5435 return 0;
5436 }
5437
5438 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
5439 unsigned temp, int chan,
5440 struct r600_bytecode_alu_src *bc_src,
5441 const struct r600_shader_src *shader_src)
5442 {
5443 struct r600_bytecode_alu alu;
5444 int r;
5445
5446 r600_bytecode_src(bc_src, shader_src, chan);
5447
5448 /* op3 operands don't support abs modifier */
5449 if (bc_src->abs) {
5450 assert(temp!=0); /* we actually need the extra register, make sure it is allocated. */
5451 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5452 alu.op = ALU_OP1_MOV;
5453 alu.dst.sel = temp;
5454 alu.dst.chan = chan;
5455 alu.dst.write = 1;
5456
5457 alu.src[0] = *bc_src;
5458 alu.last = true; // sufficient?
5459 r = r600_bytecode_add_alu(ctx->bc, &alu);
5460 if (r)
5461 return r;
5462
5463 memset(bc_src, 0, sizeof(*bc_src));
5464 bc_src->sel = temp;
5465 bc_src->chan = chan;
5466 }
5467 return 0;
5468 }
5469
5470 static int tgsi_op3(struct r600_shader_ctx *ctx)
5471 {
5472 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5473 struct r600_bytecode_alu alu;
5474 int i, j, r;
5475 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5476 int temp_regs[4];
5477
5478 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5479 temp_regs[j] = 0;
5480 if (ctx->src[j].abs)
5481 temp_regs[j] = r600_get_temp(ctx);
5482 }
5483 for (i = 0; i < lasti + 1; i++) {
5484 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5485 continue;
5486
5487 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5488 alu.op = ctx->inst_info->op;
5489 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5490 r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
5491 if (r)
5492 return r;
5493 }
5494
5495 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5496 alu.dst.chan = i;
5497 alu.dst.write = 1;
5498 alu.is_op3 = 1;
5499 if (i == lasti) {
5500 alu.last = 1;
5501 }
5502 r = r600_bytecode_add_alu(ctx->bc, &alu);
5503 if (r)
5504 return r;
5505 }
5506 return 0;
5507 }
5508
5509 static int tgsi_dp(struct r600_shader_ctx *ctx)
5510 {
5511 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5512 struct r600_bytecode_alu alu;
5513 int i, j, r;
5514
5515 for (i = 0; i < 4; i++) {
5516 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5517 alu.op = ctx->inst_info->op;
5518 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5519 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
5520 }
5521
5522 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5523 alu.dst.chan = i;
5524 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5525 /* handle some special cases */
5526 switch (inst->Instruction.Opcode) {
5527 case TGSI_OPCODE_DP2:
5528 if (i > 1) {
5529 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
5530 alu.src[0].chan = alu.src[1].chan = 0;
5531 }
5532 break;
5533 case TGSI_OPCODE_DP3:
5534 if (i > 2) {
5535 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
5536 alu.src[0].chan = alu.src[1].chan = 0;
5537 }
5538 break;
5539 case TGSI_OPCODE_DPH:
5540 if (i == 3) {
5541 alu.src[0].sel = V_SQ_ALU_SRC_1;
5542 alu.src[0].chan = 0;
5543 alu.src[0].neg = 0;
5544 }
5545 break;
5546 default:
5547 break;
5548 }
5549 if (i == 3) {
5550 alu.last = 1;
5551 }
5552 r = r600_bytecode_add_alu(ctx->bc, &alu);
5553 if (r)
5554 return r;
5555 }
5556 return 0;
5557 }
5558
5559 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
5560 unsigned index)
5561 {
5562 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5563 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
5564 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
5565 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
5566 ctx->src[index].neg || ctx->src[index].abs ||
5567 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == TGSI_PROCESSOR_GEOMETRY);
5568 }
5569
5570 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
5571 unsigned index)
5572 {
5573 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5574 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
5575 }
5576
5577 static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
5578 {
5579 struct r600_bytecode_vtx vtx;
5580 struct r600_bytecode_alu alu;
5581 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5582 int src_gpr, r, i;
5583 int id = tgsi_tex_get_src_gpr(ctx, 1);
5584
5585 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
5586 if (src_requires_loading) {
5587 for (i = 0; i < 4; i++) {
5588 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5589 alu.op = ALU_OP1_MOV;
5590 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5591 alu.dst.sel = ctx->temp_reg;
5592 alu.dst.chan = i;
5593 if (i == 3)
5594 alu.last = 1;
5595 alu.dst.write = 1;
5596 r = r600_bytecode_add_alu(ctx->bc, &alu);
5597 if (r)
5598 return r;
5599 }
5600 src_gpr = ctx->temp_reg;
5601 }
5602
5603 memset(&vtx, 0, sizeof(vtx));
5604 vtx.op = FETCH_OP_VFETCH;
5605 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
5606 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
5607 vtx.src_gpr = src_gpr;
5608 vtx.mega_fetch_count = 16;
5609 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
5610 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
5611 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
5612 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
5613 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
5614 vtx.use_const_fields = 1;
5615
5616 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
5617 return r;
5618
5619 if (ctx->bc->chip_class >= EVERGREEN)
5620 return 0;
5621
5622 for (i = 0; i < 4; i++) {
5623 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5624 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5625 continue;
5626
5627 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5628 alu.op = ALU_OP2_AND_INT;
5629
5630 alu.dst.chan = i;
5631 alu.dst.sel = vtx.dst_gpr;
5632 alu.dst.write = 1;
5633
5634 alu.src[0].sel = vtx.dst_gpr;
5635 alu.src[0].chan = i;
5636
5637 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
5638 alu.src[1].sel += (id * 2);
5639 alu.src[1].chan = i % 4;
5640 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5641
5642 if (i == lasti)
5643 alu.last = 1;
5644 r = r600_bytecode_add_alu(ctx->bc, &alu);
5645 if (r)
5646 return r;
5647 }
5648
5649 if (inst->Dst[0].Register.WriteMask & 3) {
5650 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5651 alu.op = ALU_OP2_OR_INT;
5652
5653 alu.dst.chan = 3;
5654 alu.dst.sel = vtx.dst_gpr;
5655 alu.dst.write = 1;
5656
5657 alu.src[0].sel = vtx.dst_gpr;
5658 alu.src[0].chan = 3;
5659
5660 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
5661 alu.src[1].chan = 0;
5662 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5663
5664 alu.last = 1;
5665 r = r600_bytecode_add_alu(ctx->bc, &alu);
5666 if (r)
5667 return r;
5668 }
5669 return 0;
5670 }
5671
5672 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
5673 {
5674 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5675 struct r600_bytecode_alu alu;
5676 int r;
5677 int id = tgsi_tex_get_src_gpr(ctx, 1);
5678
5679 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5680 alu.op = ALU_OP1_MOV;
5681 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
5682 if (ctx->bc->chip_class >= EVERGREEN) {
5683 /* channel 0 or 2 of each word */
5684 alu.src[0].sel += (id / 2);
5685 alu.src[0].chan = (id % 2) * 2;
5686 } else {
5687 /* r600 we have them at channel 2 of the second dword */
5688 alu.src[0].sel += (id * 2) + 1;
5689 alu.src[0].chan = 1;
5690 }
5691 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5692 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
5693 alu.last = 1;
5694 r = r600_bytecode_add_alu(ctx->bc, &alu);
5695 if (r)
5696 return r;
5697 return 0;
5698 }
5699
5700 static int tgsi_tex(struct r600_shader_ctx *ctx)
5701 {
5702 static float one_point_five = 1.5f;
5703 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5704 struct r600_bytecode_tex tex;
5705 struct r600_bytecode_alu alu;
5706 unsigned src_gpr;
5707 int r, i, j;
5708 int opcode;
5709 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
5710 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5711 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
5712 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
5713
5714 bool txf_add_offsets = inst->Texture.NumOffsets &&
5715 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5716 inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
5717
5718 /* Texture fetch instructions can only use gprs as source.
5719 * Also they cannot negate the source or take the absolute value */
5720 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
5721 inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
5722 tgsi_tex_src_requires_loading(ctx, 0)) ||
5723 read_compressed_msaa || txf_add_offsets;
5724
5725 boolean src_loaded = FALSE;
5726 unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
5727 int8_t offset_x = 0, offset_y = 0, offset_z = 0;
5728 boolean has_txq_cube_array_z = false;
5729 unsigned sampler_index_mode;
5730
5731 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
5732 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5733 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
5734 if (inst->Dst[0].Register.WriteMask & 4) {
5735 ctx->shader->has_txq_cube_array_z_comp = true;
5736 has_txq_cube_array_z = true;
5737 }
5738
5739 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
5740 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5741 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
5742 inst->Instruction.Opcode == TGSI_OPCODE_TG4)
5743 sampler_src_reg = 2;
5744
5745 /* TGSI moves the sampler to src reg 3 for TXD */
5746 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
5747 sampler_src_reg = 3;
5748
5749 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
5750
5751 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
5752
5753 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
5754 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
5755 ctx->shader->uses_tex_buffers = true;
5756 return r600_do_buffer_txq(ctx);
5757 }
5758 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
5759 if (ctx->bc->chip_class < EVERGREEN)
5760 ctx->shader->uses_tex_buffers = true;
5761 return do_vtx_fetch_inst(ctx, src_requires_loading);
5762 }
5763 }
5764
5765 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
5766 int out_chan;
5767 /* Add perspective divide */
5768 if (ctx->bc->chip_class == CAYMAN) {
5769 out_chan = 2;
5770 for (i = 0; i < 3; i++) {
5771 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5772 alu.op = ALU_OP1_RECIP_IEEE;
5773 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5774
5775 alu.dst.sel = ctx->temp_reg;
5776 alu.dst.chan = i;
5777 if (i == 2)
5778 alu.last = 1;
5779 if (out_chan == i)
5780 alu.dst.write = 1;
5781 r = r600_bytecode_add_alu(ctx->bc, &alu);
5782 if (r)
5783 return r;
5784 }
5785
5786 } else {
5787 out_chan = 3;
5788 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5789 alu.op = ALU_OP1_RECIP_IEEE;
5790 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5791
5792 alu.dst.sel = ctx->temp_reg;
5793 alu.dst.chan = out_chan;
5794 alu.last = 1;
5795 alu.dst.write = 1;
5796 r = r600_bytecode_add_alu(ctx->bc, &alu);
5797 if (r)
5798 return r;
5799 }
5800
5801 for (i = 0; i < 3; i++) {
5802 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5803 alu.op = ALU_OP2_MUL;
5804 alu.src[0].sel = ctx->temp_reg;
5805 alu.src[0].chan = out_chan;
5806 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5807 alu.dst.sel = ctx->temp_reg;
5808 alu.dst.chan = i;
5809 alu.dst.write = 1;
5810 r = r600_bytecode_add_alu(ctx->bc, &alu);
5811 if (r)
5812 return r;
5813 }
5814 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5815 alu.op = ALU_OP1_MOV;
5816 alu.src[0].sel = V_SQ_ALU_SRC_1;
5817 alu.src[0].chan = 0;
5818 alu.dst.sel = ctx->temp_reg;
5819 alu.dst.chan = 3;
5820 alu.last = 1;
5821 alu.dst.write = 1;
5822 r = r600_bytecode_add_alu(ctx->bc, &alu);
5823 if (r)
5824 return r;
5825 src_loaded = TRUE;
5826 src_gpr = ctx->temp_reg;
5827 }
5828
5829
5830 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
5831 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5832 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5833 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
5834 inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
5835 inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
5836
5837 static const unsigned src0_swizzle[] = {2, 2, 0, 1};
5838 static const unsigned src1_swizzle[] = {1, 0, 2, 2};
5839
5840 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
5841 for (i = 0; i < 4; i++) {
5842 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5843 alu.op = ALU_OP2_CUBE;
5844 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
5845 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
5846 alu.dst.sel = ctx->temp_reg;
5847 alu.dst.chan = i;
5848 if (i == 3)
5849 alu.last = 1;
5850 alu.dst.write = 1;
5851 r = r600_bytecode_add_alu(ctx->bc, &alu);
5852 if (r)
5853 return r;
5854 }
5855
5856 /* tmp1.z = RCP_e(|tmp1.z|) */
5857 if (ctx->bc->chip_class == CAYMAN) {
5858 for (i = 0; i < 3; i++) {
5859 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5860 alu.op = ALU_OP1_RECIP_IEEE;
5861 alu.src[0].sel = ctx->temp_reg;
5862 alu.src[0].chan = 2;
5863 alu.src[0].abs = 1;
5864 alu.dst.sel = ctx->temp_reg;
5865 alu.dst.chan = i;
5866 if (i == 2)
5867 alu.dst.write = 1;
5868 if (i == 2)
5869 alu.last = 1;
5870 r = r600_bytecode_add_alu(ctx->bc, &alu);
5871 if (r)
5872 return r;
5873 }
5874 } else {
5875 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5876 alu.op = ALU_OP1_RECIP_IEEE;
5877 alu.src[0].sel = ctx->temp_reg;
5878 alu.src[0].chan = 2;
5879 alu.src[0].abs = 1;
5880 alu.dst.sel = ctx->temp_reg;
5881 alu.dst.chan = 2;
5882 alu.dst.write = 1;
5883 alu.last = 1;
5884 r = r600_bytecode_add_alu(ctx->bc, &alu);
5885 if (r)
5886 return r;
5887 }
5888
5889 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
5890 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
5891 * muladd has no writemask, have to use another temp
5892 */
5893 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5894 alu.op = ALU_OP3_MULADD;
5895 alu.is_op3 = 1;
5896
5897 alu.src[0].sel = ctx->temp_reg;
5898 alu.src[0].chan = 0;
5899 alu.src[1].sel = ctx->temp_reg;
5900 alu.src[1].chan = 2;
5901
5902 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5903 alu.src[2].chan = 0;
5904 alu.src[2].value = *(uint32_t *)&one_point_five;
5905
5906 alu.dst.sel = ctx->temp_reg;
5907 alu.dst.chan = 0;
5908 alu.dst.write = 1;
5909
5910 r = r600_bytecode_add_alu(ctx->bc, &alu);
5911 if (r)
5912 return r;
5913
5914 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5915 alu.op = ALU_OP3_MULADD;
5916 alu.is_op3 = 1;
5917
5918 alu.src[0].sel = ctx->temp_reg;
5919 alu.src[0].chan = 1;
5920 alu.src[1].sel = ctx->temp_reg;
5921 alu.src[1].chan = 2;
5922
5923 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5924 alu.src[2].chan = 0;
5925 alu.src[2].value = *(uint32_t *)&one_point_five;
5926
5927 alu.dst.sel = ctx->temp_reg;
5928 alu.dst.chan = 1;
5929 alu.dst.write = 1;
5930
5931 alu.last = 1;
5932 r = r600_bytecode_add_alu(ctx->bc, &alu);
5933 if (r)
5934 return r;
5935 /* write initial compare value into Z component
5936 - W src 0 for shadow cube
5937 - X src 1 for shadow cube array */
5938 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5939 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5940 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5941 alu.op = ALU_OP1_MOV;
5942 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
5943 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5944 else
5945 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5946 alu.dst.sel = ctx->temp_reg;
5947 alu.dst.chan = 2;
5948 alu.dst.write = 1;
5949 alu.last = 1;
5950 r = r600_bytecode_add_alu(ctx->bc, &alu);
5951 if (r)
5952 return r;
5953 }
5954
5955 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5956 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5957 if (ctx->bc->chip_class >= EVERGREEN) {
5958 int mytmp = r600_get_temp(ctx);
5959 static const float eight = 8.0f;
5960 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5961 alu.op = ALU_OP1_MOV;
5962 alu.src[0].sel = ctx->temp_reg;
5963 alu.src[0].chan = 3;
5964 alu.dst.sel = mytmp;
5965 alu.dst.chan = 0;
5966 alu.dst.write = 1;
5967 alu.last = 1;
5968 r = r600_bytecode_add_alu(ctx->bc, &alu);
5969 if (r)
5970 return r;
5971
5972 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */
5973 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5974 alu.op = ALU_OP3_MULADD;
5975 alu.is_op3 = 1;
5976 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5977 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5978 alu.src[1].chan = 0;
5979 alu.src[1].value = *(uint32_t *)&eight;
5980 alu.src[2].sel = mytmp;
5981 alu.src[2].chan = 0;
5982 alu.dst.sel = ctx->temp_reg;
5983 alu.dst.chan = 3;
5984 alu.dst.write = 1;
5985 alu.last = 1;
5986 r = r600_bytecode_add_alu(ctx->bc, &alu);
5987 if (r)
5988 return r;
5989 } else if (ctx->bc->chip_class < EVERGREEN) {
5990 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5991 tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
5992 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5993 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5994 tex.src_gpr = r600_get_temp(ctx);
5995 tex.src_sel_x = 0;
5996 tex.src_sel_y = 0;
5997 tex.src_sel_z = 0;
5998 tex.src_sel_w = 0;
5999 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
6000 tex.coord_type_x = 1;
6001 tex.coord_type_y = 1;
6002 tex.coord_type_z = 1;
6003 tex.coord_type_w = 1;
6004 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6005 alu.op = ALU_OP1_MOV;
6006 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6007 alu.dst.sel = tex.src_gpr;
6008 alu.dst.chan = 0;
6009 alu.last = 1;
6010 alu.dst.write = 1;
6011 r = r600_bytecode_add_alu(ctx->bc, &alu);
6012 if (r)
6013 return r;
6014
6015 r = r600_bytecode_add_tex(ctx->bc, &tex);
6016 if (r)
6017 return r;
6018 }
6019
6020 }
6021
6022 /* for cube forms of lod and bias we need to route things */
6023 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
6024 inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
6025 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6026 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
6027 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6028 alu.op = ALU_OP1_MOV;
6029 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
6030 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
6031 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
6032 else
6033 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
6034 alu.dst.sel = ctx->temp_reg;
6035 alu.dst.chan = 2;
6036 alu.last = 1;
6037 alu.dst.write = 1;
6038 r = r600_bytecode_add_alu(ctx->bc, &alu);
6039 if (r)
6040 return r;
6041 }
6042
6043 src_loaded = TRUE;
6044 src_gpr = ctx->temp_reg;
6045 }
6046
6047 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
6048 int temp_h = 0, temp_v = 0;
6049 int start_val = 0;
6050
6051 /* if we've already loaded the src (i.e. CUBE don't reload it). */
6052 if (src_loaded == TRUE)
6053 start_val = 1;
6054 else
6055 src_loaded = TRUE;
6056 for (i = start_val; i < 3; i++) {
6057 int treg = r600_get_temp(ctx);
6058
6059 if (i == 0)
6060 src_gpr = treg;
6061 else if (i == 1)
6062 temp_h = treg;
6063 else
6064 temp_v = treg;
6065
6066 for (j = 0; j < 4; j++) {
6067 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6068 alu.op = ALU_OP1_MOV;
6069 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
6070 alu.dst.sel = treg;
6071 alu.dst.chan = j;
6072 if (j == 3)
6073 alu.last = 1;
6074 alu.dst.write = 1;
6075 r = r600_bytecode_add_alu(ctx->bc, &alu);
6076 if (r)
6077 return r;
6078 }
6079 }
6080 for (i = 1; i < 3; i++) {
6081 /* set gradients h/v */
6082 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6083 tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
6084 FETCH_OP_SET_GRADIENTS_V;
6085 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6086 tex.sampler_index_mode = sampler_index_mode;
6087 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6088 tex.resource_index_mode = sampler_index_mode;
6089
6090 tex.src_gpr = (i == 1) ? temp_h : temp_v;
6091 tex.src_sel_x = 0;
6092 tex.src_sel_y = 1;
6093 tex.src_sel_z = 2;
6094 tex.src_sel_w = 3;
6095
6096 tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
6097 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
6098 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
6099 tex.coord_type_x = 1;
6100 tex.coord_type_y = 1;
6101 tex.coord_type_z = 1;
6102 tex.coord_type_w = 1;
6103 }
6104 r = r600_bytecode_add_tex(ctx->bc, &tex);
6105 if (r)
6106 return r;
6107 }
6108 }
6109
6110 if (src_requires_loading && !src_loaded) {
6111 for (i = 0; i < 4; i++) {
6112 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6113 alu.op = ALU_OP1_MOV;
6114 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6115 alu.dst.sel = ctx->temp_reg;
6116 alu.dst.chan = i;
6117 if (i == 3)
6118 alu.last = 1;
6119 alu.dst.write = 1;
6120 r = r600_bytecode_add_alu(ctx->bc, &alu);
6121 if (r)
6122 return r;
6123 }
6124 src_loaded = TRUE;
6125 src_gpr = ctx->temp_reg;
6126 }
6127
6128 /* get offset values */
6129 if (inst->Texture.NumOffsets) {
6130 assert(inst->Texture.NumOffsets == 1);
6131
6132 /* The texture offset feature doesn't work with the TXF instruction
6133 * and must be emulated by adding the offset to the texture coordinates. */
6134 if (txf_add_offsets) {
6135 const struct tgsi_texture_offset *off = inst->TexOffsets;
6136
6137 switch (inst->Texture.Texture) {
6138 case TGSI_TEXTURE_3D:
6139 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6140 alu.op = ALU_OP2_ADD_INT;
6141 alu.src[0].sel = src_gpr;
6142 alu.src[0].chan = 2;
6143 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6144 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
6145 alu.dst.sel = src_gpr;
6146 alu.dst.chan = 2;
6147 alu.dst.write = 1;
6148 alu.last = 1;
6149 r = r600_bytecode_add_alu(ctx->bc, &alu);
6150 if (r)
6151 return r;
6152 /* fall through */
6153
6154 case TGSI_TEXTURE_2D:
6155 case TGSI_TEXTURE_SHADOW2D:
6156 case TGSI_TEXTURE_RECT:
6157 case TGSI_TEXTURE_SHADOWRECT:
6158 case TGSI_TEXTURE_2D_ARRAY:
6159 case TGSI_TEXTURE_SHADOW2D_ARRAY:
6160 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6161 alu.op = ALU_OP2_ADD_INT;
6162 alu.src[0].sel = src_gpr;
6163 alu.src[0].chan = 1;
6164 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6165 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
6166 alu.dst.sel = src_gpr;
6167 alu.dst.chan = 1;
6168 alu.dst.write = 1;
6169 alu.last = 1;
6170 r = r600_bytecode_add_alu(ctx->bc, &alu);
6171 if (r)
6172 return r;
6173 /* fall through */
6174
6175 case TGSI_TEXTURE_1D:
6176 case TGSI_TEXTURE_SHADOW1D:
6177 case TGSI_TEXTURE_1D_ARRAY:
6178 case TGSI_TEXTURE_SHADOW1D_ARRAY:
6179 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6180 alu.op = ALU_OP2_ADD_INT;
6181 alu.src[0].sel = src_gpr;
6182 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6183 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
6184 alu.dst.sel = src_gpr;
6185 alu.dst.write = 1;
6186 alu.last = 1;
6187 r = r600_bytecode_add_alu(ctx->bc, &alu);
6188 if (r)
6189 return r;
6190 break;
6191 /* texture offsets do not apply to other texture targets */
6192 }
6193 } else {
6194 switch (inst->Texture.Texture) {
6195 case TGSI_TEXTURE_3D:
6196 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
6197 /* fallthrough */
6198 case TGSI_TEXTURE_2D:
6199 case TGSI_TEXTURE_SHADOW2D:
6200 case TGSI_TEXTURE_RECT:
6201 case TGSI_TEXTURE_SHADOWRECT:
6202 case TGSI_TEXTURE_2D_ARRAY:
6203 case TGSI_TEXTURE_SHADOW2D_ARRAY:
6204 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
6205 /* fallthrough */
6206 case TGSI_TEXTURE_1D:
6207 case TGSI_TEXTURE_SHADOW1D:
6208 case TGSI_TEXTURE_1D_ARRAY:
6209 case TGSI_TEXTURE_SHADOW1D_ARRAY:
6210 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
6211 }
6212 }
6213 }
6214
6215 /* Obtain the sample index for reading a compressed MSAA color texture.
6216 * To read the FMASK, we use the ldfptr instruction, which tells us
6217 * where the samples are stored.
6218 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
6219 * which is the identity mapping. Each nibble says which physical sample
6220 * should be fetched to get that sample.
6221 *
6222 * Assume src.z contains the sample index. It should be modified like this:
6223 * src.z = (ldfptr() >> (src.z * 4)) & 0xF;
6224 * Then fetch the texel with src.
6225 */
6226 if (read_compressed_msaa) {
6227 unsigned sample_chan = 3;
6228 unsigned temp = r600_get_temp(ctx);
6229 assert(src_loaded);
6230
6231 /* temp.w = ldfptr() */
6232 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6233 tex.op = FETCH_OP_LD;
6234 tex.inst_mod = 1; /* to indicate this is ldfptr */
6235 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6236 tex.sampler_index_mode = sampler_index_mode;
6237 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6238 tex.resource_index_mode = sampler_index_mode;
6239 tex.src_gpr = src_gpr;
6240 tex.dst_gpr = temp;
6241 tex.dst_sel_x = 7; /* mask out these components */
6242 tex.dst_sel_y = 7;
6243 tex.dst_sel_z = 7;
6244 tex.dst_sel_w = 0; /* store X */
6245 tex.src_sel_x = 0;
6246 tex.src_sel_y = 1;
6247 tex.src_sel_z = 2;
6248 tex.src_sel_w = 3;
6249 tex.offset_x = offset_x;
6250 tex.offset_y = offset_y;
6251 tex.offset_z = offset_z;
6252 r = r600_bytecode_add_tex(ctx->bc, &tex);
6253 if (r)
6254 return r;
6255
6256 /* temp.x = sample_index*4 */
6257 if (ctx->bc->chip_class == CAYMAN) {
6258 for (i = 0 ; i < 4; i++) {
6259 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6260 alu.op = ALU_OP2_MULLO_INT;
6261 alu.src[0].sel = src_gpr;
6262 alu.src[0].chan = sample_chan;
6263 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6264 alu.src[1].value = 4;
6265 alu.dst.sel = temp;
6266 alu.dst.chan = i;
6267 alu.dst.write = i == 0;
6268 if (i == 3)
6269 alu.last = 1;
6270 r = r600_bytecode_add_alu(ctx->bc, &alu);
6271 if (r)
6272 return r;
6273 }
6274 } else {
6275 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6276 alu.op = ALU_OP2_MULLO_INT;
6277 alu.src[0].sel = src_gpr;
6278 alu.src[0].chan = sample_chan;
6279 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6280 alu.src[1].value = 4;
6281 alu.dst.sel = temp;
6282 alu.dst.chan = 0;
6283 alu.dst.write = 1;
6284 alu.last = 1;
6285 r = r600_bytecode_add_alu(ctx->bc, &alu);
6286 if (r)
6287 return r;
6288 }
6289
6290 /* sample_index = temp.w >> temp.x */
6291 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6292 alu.op = ALU_OP2_LSHR_INT;
6293 alu.src[0].sel = temp;
6294 alu.src[0].chan = 3;
6295 alu.src[1].sel = temp;
6296 alu.src[1].chan = 0;
6297 alu.dst.sel = src_gpr;
6298 alu.dst.chan = sample_chan;
6299 alu.dst.write = 1;
6300 alu.last = 1;
6301 r = r600_bytecode_add_alu(ctx->bc, &alu);
6302 if (r)
6303 return r;
6304
6305 /* sample_index & 0xF */
6306 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6307 alu.op = ALU_OP2_AND_INT;
6308 alu.src[0].sel = src_gpr;
6309 alu.src[0].chan = sample_chan;
6310 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6311 alu.src[1].value = 0xF;
6312 alu.dst.sel = src_gpr;
6313 alu.dst.chan = sample_chan;
6314 alu.dst.write = 1;
6315 alu.last = 1;
6316 r = r600_bytecode_add_alu(ctx->bc, &alu);
6317 if (r)
6318 return r;
6319 #if 0
6320 /* visualize the FMASK */
6321 for (i = 0; i < 4; i++) {
6322 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6323 alu.op = ALU_OP1_INT_TO_FLT;
6324 alu.src[0].sel = src_gpr;
6325 alu.src[0].chan = sample_chan;
6326 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6327 alu.dst.chan = i;
6328 alu.dst.write = 1;
6329 alu.last = 1;
6330 r = r600_bytecode_add_alu(ctx->bc, &alu);
6331 if (r)
6332 return r;
6333 }
6334 return 0;
6335 #endif
6336 }
6337
6338 /* does this shader want a num layers from TXQ for a cube array? */
6339 if (has_txq_cube_array_z) {
6340 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6341
6342 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6343 alu.op = ALU_OP1_MOV;
6344
6345 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
6346 if (ctx->bc->chip_class >= EVERGREEN) {
6347 /* channel 1 or 3 of each word */
6348 alu.src[0].sel += (id / 2);
6349 alu.src[0].chan = ((id % 2) * 2) + 1;
6350 } else {
6351 /* r600 we have them at channel 2 of the second dword */
6352 alu.src[0].sel += (id * 2) + 1;
6353 alu.src[0].chan = 2;
6354 }
6355 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6356 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
6357 alu.last = 1;
6358 r = r600_bytecode_add_alu(ctx->bc, &alu);
6359 if (r)
6360 return r;
6361 /* disable writemask from texture instruction */
6362 inst->Dst[0].Register.WriteMask &= ~4;
6363 }
6364
6365 opcode = ctx->inst_info->op;
6366 if (opcode == FETCH_OP_GATHER4 &&
6367 inst->TexOffsets[0].File != TGSI_FILE_NULL &&
6368 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
6369 opcode = FETCH_OP_GATHER4_O;
6370
6371 /* GATHER4_O/GATHER4_C_O use offset values loaded by
6372 SET_TEXTURE_OFFSETS instruction. The immediate offset values
6373 encoded in the instruction are ignored. */
6374 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6375 tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
6376 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6377 tex.sampler_index_mode = sampler_index_mode;
6378 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6379 tex.resource_index_mode = sampler_index_mode;
6380
6381 tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
6382 tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
6383 tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
6384 tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
6385 tex.src_sel_w = 4;
6386
6387 tex.dst_sel_x = 7;
6388 tex.dst_sel_y = 7;
6389 tex.dst_sel_z = 7;
6390 tex.dst_sel_w = 7;
6391
6392 r = r600_bytecode_add_tex(ctx->bc, &tex);
6393 if (r)
6394 return r;
6395 }
6396
6397 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
6398 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
6399 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
6400 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6401 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
6402 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
6403 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6404 switch (opcode) {
6405 case FETCH_OP_SAMPLE:
6406 opcode = FETCH_OP_SAMPLE_C;
6407 break;
6408 case FETCH_OP_SAMPLE_L:
6409 opcode = FETCH_OP_SAMPLE_C_L;
6410 break;
6411 case FETCH_OP_SAMPLE_LB:
6412 opcode = FETCH_OP_SAMPLE_C_LB;
6413 break;
6414 case FETCH_OP_SAMPLE_G:
6415 opcode = FETCH_OP_SAMPLE_C_G;
6416 break;
6417 /* Texture gather variants */
6418 case FETCH_OP_GATHER4:
6419 opcode = FETCH_OP_GATHER4_C;
6420 break;
6421 case FETCH_OP_GATHER4_O:
6422 opcode = FETCH_OP_GATHER4_C_O;
6423 break;
6424 }
6425 }
6426
6427 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6428 tex.op = opcode;
6429
6430 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6431 tex.sampler_index_mode = sampler_index_mode;
6432 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6433 tex.resource_index_mode = sampler_index_mode;
6434 tex.src_gpr = src_gpr;
6435 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6436
6437 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
6438 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
6439 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
6440 }
6441
6442 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
6443 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
6444 tex.inst_mod = texture_component_select;
6445
6446 if (ctx->bc->chip_class == CAYMAN) {
6447 /* GATHER4 result order is different from TGSI TG4 */
6448 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
6449 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
6450 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
6451 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6452 } else {
6453 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6454 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
6455 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6456 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6457 }
6458 }
6459 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
6460 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6461 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6462 tex.dst_sel_z = 7;
6463 tex.dst_sel_w = 7;
6464 }
6465 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
6466 tex.dst_sel_x = 3;
6467 tex.dst_sel_y = 7;
6468 tex.dst_sel_z = 7;
6469 tex.dst_sel_w = 7;
6470 }
6471 else {
6472 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6473 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6474 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
6475 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6476 }
6477
6478
6479 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ||
6480 inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
6481 tex.src_sel_x = 4;
6482 tex.src_sel_y = 4;
6483 tex.src_sel_z = 4;
6484 tex.src_sel_w = 4;
6485 } else if (src_loaded) {
6486 tex.src_sel_x = 0;
6487 tex.src_sel_y = 1;
6488 tex.src_sel_z = 2;
6489 tex.src_sel_w = 3;
6490 } else {
6491 tex.src_sel_x = ctx->src[0].swizzle[0];
6492 tex.src_sel_y = ctx->src[0].swizzle[1];
6493 tex.src_sel_z = ctx->src[0].swizzle[2];
6494 tex.src_sel_w = ctx->src[0].swizzle[3];
6495 tex.src_rel = ctx->src[0].rel;
6496 }
6497
6498 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
6499 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6500 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6501 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6502 tex.src_sel_x = 1;
6503 tex.src_sel_y = 0;
6504 tex.src_sel_z = 3;
6505 tex.src_sel_w = 2; /* route Z compare or Lod value into W */
6506 }
6507
6508 if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
6509 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
6510 tex.coord_type_x = 1;
6511 tex.coord_type_y = 1;
6512 }
6513 tex.coord_type_z = 1;
6514 tex.coord_type_w = 1;
6515
6516 tex.offset_x = offset_x;
6517 tex.offset_y = offset_y;
6518 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
6519 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
6520 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
6521 tex.offset_z = 0;
6522 }
6523 else {
6524 tex.offset_z = offset_z;
6525 }
6526
6527 /* Put the depth for comparison in W.
6528 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
6529 * Some instructions expect the depth in Z. */
6530 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
6531 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
6532 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
6533 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
6534 opcode != FETCH_OP_SAMPLE_C_L &&
6535 opcode != FETCH_OP_SAMPLE_C_LB) {
6536 tex.src_sel_w = tex.src_sel_z;
6537 }
6538
6539 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
6540 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
6541 if (opcode == FETCH_OP_SAMPLE_C_L ||
6542 opcode == FETCH_OP_SAMPLE_C_LB) {
6543 /* the array index is read from Y */
6544 tex.coord_type_y = 0;
6545 } else {
6546 /* the array index is read from Z */
6547 tex.coord_type_z = 0;
6548 tex.src_sel_z = tex.src_sel_y;
6549 }
6550 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
6551 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
6552 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6553 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
6554 (ctx->bc->chip_class >= EVERGREEN)))
6555 /* the array index is read from Z */
6556 tex.coord_type_z = 0;
6557
6558 /* mask unused source components */
6559 if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
6560 switch (inst->Texture.Texture) {
6561 case TGSI_TEXTURE_2D:
6562 case TGSI_TEXTURE_RECT:
6563 tex.src_sel_z = 7;
6564 tex.src_sel_w = 7;
6565 break;
6566 case TGSI_TEXTURE_1D_ARRAY:
6567 tex.src_sel_y = 7;
6568 tex.src_sel_w = 7;
6569 break;
6570 case TGSI_TEXTURE_1D:
6571 tex.src_sel_y = 7;
6572 tex.src_sel_z = 7;
6573 tex.src_sel_w = 7;
6574 break;
6575 }
6576 }
6577
6578 r = r600_bytecode_add_tex(ctx->bc, &tex);
6579 if (r)
6580 return r;
6581
6582 /* add shadow ambient support - gallium doesn't do it yet */
6583 return 0;
6584 }
6585
/* Expand TGSI LRP (dst = src0 * src1 + (1 - src0) * src2) into ALU ops.
 * Intermediate results go through ctx->temp_reg; the final MULADD writes
 * the real destination. Returns 0 on success or the error code from
 * r600_bytecode_add_alu(). */
static int tgsi_lrp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned i, temp_regs[2];
	int r;

	/* optimize if it's just an equal balance: 0.5*src1 + 0.5*src2 can be
	 * done with a single ADD using the divide-by-two output modifier */
	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
		for (i = 0; i < lasti + 1; i++) {
			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
			alu.omod = 3; /* output modifier: result * 0.5 */
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.chan = i;
			if (i == lasti) {
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* 1 - src0, computed as 1 + (-src0) */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		r600_bytecode_src_toggle_neg(&alu.src[1]);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* (1 - src0) * src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* src0 * src1 + (1 - src0) * src2 */
	/* sources carrying an abs modifier need a scratch register so they can
	 * be encoded as op3 operands (see tgsi_make_src_for_op3) */
	if (ctx->src[0].abs)
		temp_regs[0] = r600_get_temp(ctx);
	else
		temp_regs[0] = 0;
	if (ctx->src[1].abs)
		temp_regs[1] = r600_get_temp(ctx);
	else
		temp_regs[1] = 0;

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;
		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
		if (r)
			return r;
		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
		if (r)
			return r;
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6697
6698 static int tgsi_cmp(struct r600_shader_ctx *ctx)
6699 {
6700 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6701 struct r600_bytecode_alu alu;
6702 int i, r, j;
6703 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6704 int temp_regs[3];
6705
6706 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6707 temp_regs[j] = 0;
6708 if (ctx->src[j].abs)
6709 temp_regs[j] = r600_get_temp(ctx);
6710 }
6711
6712 for (i = 0; i < lasti + 1; i++) {
6713 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6714 continue;
6715
6716 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6717 alu.op = ALU_OP3_CNDGE;
6718 r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
6719 if (r)
6720 return r;
6721 r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
6722 if (r)
6723 return r;
6724 r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
6725 if (r)
6726 return r;
6727 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6728 alu.dst.chan = i;
6729 alu.dst.write = 1;
6730 alu.is_op3 = 1;
6731 if (i == lasti)
6732 alu.last = 1;
6733 r = r600_bytecode_add_alu(ctx->bc, &alu);
6734 if (r)
6735 return r;
6736 }
6737 return 0;
6738 }
6739
6740 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
6741 {
6742 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6743 struct r600_bytecode_alu alu;
6744 int i, r;
6745 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6746
6747 for (i = 0; i < lasti + 1; i++) {
6748 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6749 continue;
6750
6751 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6752 alu.op = ALU_OP3_CNDE_INT;
6753 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6754 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6755 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
6756 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6757 alu.dst.chan = i;
6758 alu.dst.write = 1;
6759 alu.is_op3 = 1;
6760 if (i == lasti)
6761 alu.last = 1;
6762 r = r600_bytecode_add_alu(ctx->bc, &alu);
6763 if (r)
6764 return r;
6765 }
6766 return 0;
6767 }
6768
/* Expand TGSI XPD (cross product) in two passes:
 *   pass 1: tmp[i] = src0[s0[i]] * src1[s1[i]]
 *   pass 2: dst[i] = src0[s1[i]] * src1[s0[i]] - tmp[i]
 * with s0 = {2,0,1}, s1 = {1,2,0}, which yields dst.xyz = src0 x src1.
 * Channel 3 is computed from zero sources in both passes.
 * With a partial write mask the result is built in temp_reg and copied
 * out via tgsi_helper_copy(). */
static int tgsi_xpd(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	static const unsigned int src0_swizzle[] = {2, 0, 1};
	static const unsigned int src1_swizzle[] = {1, 2, 0};
	struct r600_bytecode_alu alu;
	uint32_t use_temp = 0;
	int i, r;

	if (inst->Dst[0].Register.WriteMask != 0xf)
		use_temp = 1;

	/* pass 1: the products to be subtracted */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		if (i < 3) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
		} else {
			/* chan 3: 0 * 0 placeholder */
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* pass 2: MULADD with the negated pass-1 product as the addend */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;

		if (i < 3) {
			/* swizzles swapped relative to pass 1 */
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
		} else {
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].neg = 1; /* subtract pass-1 result */
		alu.src[2].chan = i;

		if (use_temp)
			alu.dst.sel = ctx->temp_reg;
		else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	if (use_temp)
		return tgsi_helper_copy(ctx, inst);
	return 0;
}
6840
/* Expand TGSI EXP:
 *   result.x = 2^floor(src.x)
 *   result.y = src.x - floor(src.x)
 *   result.z = 2^src.x (rough approximation)
 *   result.w = 1.0
 * Cayman lacks the scalar transcendental unit, so EXP_IEEE is replicated
 * over three vector lanes and only the needed lane is written. Results
 * are accumulated in ctx->temp_reg and copied out by tgsi_helper_copy(). */
static int tgsi_exp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i;

	/* result.x = 2^floor(src); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* NOTE(review): both branches below reuse `alu` without a fresh
		 * memset, so modifier flags (neg/abs/rel) picked up from the
		 * original source by the FLOOR above remain set on src[0] when
		 * it is retargeted at temp_reg — TODO confirm this is benign
		 * for all source modifier combinations. */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = i == 0; /* only .x is kept */
				alu.last = i == 2;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.y = tmp - floor(tmp); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FRACT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
#if 0
		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (r)
			return r;
#endif
		alu.dst.write = 1;
		alu.dst.chan = 1;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = RoughApprox2ToX(tmp);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					/* only the last lane result (.z) is kept */
					alu.dst.write = 1;
					alu.last = 1;
				}

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;

			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0;*/
	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return tgsi_helper_copy(ctx, inst);
}
6969
6970 static int tgsi_log(struct r600_shader_ctx *ctx)
6971 {
6972 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6973 struct r600_bytecode_alu alu;
6974 int r;
6975 int i;
6976
6977 /* result.x = floor(log2(|src|)); */
6978 if (inst->Dst[0].Register.WriteMask & 1) {
6979 if (ctx->bc->chip_class == CAYMAN) {
6980 for (i = 0; i < 3; i++) {
6981 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6982
6983 alu.op = ALU_OP1_LOG_IEEE;
6984 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6985 r600_bytecode_src_set_abs(&alu.src[0]);
6986
6987 alu.dst.sel = ctx->temp_reg;
6988 alu.dst.chan = i;
6989 if (i == 0)
6990 alu.dst.write = 1;
6991 if (i == 2)
6992 alu.last = 1;
6993 r = r600_bytecode_add_alu(ctx->bc, &alu);
6994 if (r)
6995 return r;
6996 }
6997
6998 } else {
6999 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7000
7001 alu.op = ALU_OP1_LOG_IEEE;
7002 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7003 r600_bytecode_src_set_abs(&alu.src[0]);
7004
7005 alu.dst.sel = ctx->temp_reg;
7006 alu.dst.chan = 0;
7007 alu.dst.write = 1;
7008 alu.last = 1;
7009 r = r600_bytecode_add_alu(ctx->bc, &alu);
7010 if (r)
7011 return r;
7012 }
7013
7014 alu.op = ALU_OP1_FLOOR;
7015 alu.src[0].sel = ctx->temp_reg;
7016 alu.src[0].chan = 0;
7017
7018 alu.dst.sel = ctx->temp_reg;
7019 alu.dst.chan = 0;
7020 alu.dst.write = 1;
7021 alu.last = 1;
7022
7023 r = r600_bytecode_add_alu(ctx->bc, &alu);
7024 if (r)
7025 return r;
7026 }
7027
7028 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
7029 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
7030
7031 if (ctx->bc->chip_class == CAYMAN) {
7032 for (i = 0; i < 3; i++) {
7033 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7034
7035 alu.op = ALU_OP1_LOG_IEEE;
7036 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7037 r600_bytecode_src_set_abs(&alu.src[0]);
7038
7039 alu.dst.sel = ctx->temp_reg;
7040 alu.dst.chan = i;
7041 if (i == 1)
7042 alu.dst.write = 1;
7043 if (i == 2)
7044 alu.last = 1;
7045
7046 r = r600_bytecode_add_alu(ctx->bc, &alu);
7047 if (r)
7048 return r;
7049 }
7050 } else {
7051 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7052
7053 alu.op = ALU_OP1_LOG_IEEE;
7054 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7055 r600_bytecode_src_set_abs(&alu.src[0]);
7056
7057 alu.dst.sel = ctx->temp_reg;
7058 alu.dst.chan = 1;
7059 alu.dst.write = 1;
7060 alu.last = 1;
7061
7062 r = r600_bytecode_add_alu(ctx->bc, &alu);
7063 if (r)
7064 return r;
7065 }
7066
7067 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7068
7069 alu.op = ALU_OP1_FLOOR;
7070 alu.src[0].sel = ctx->temp_reg;
7071 alu.src[0].chan = 1;
7072
7073 alu.dst.sel = ctx->temp_reg;
7074 alu.dst.chan = 1;
7075 alu.dst.write = 1;
7076 alu.last = 1;
7077
7078 r = r600_bytecode_add_alu(ctx->bc, &alu);
7079 if (r)
7080 return r;
7081
7082 if (ctx->bc->chip_class == CAYMAN) {
7083 for (i = 0; i < 3; i++) {
7084 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7085 alu.op = ALU_OP1_EXP_IEEE;
7086 alu.src[0].sel = ctx->temp_reg;
7087 alu.src[0].chan = 1;
7088
7089 alu.dst.sel = ctx->temp_reg;
7090 alu.dst.chan = i;
7091 if (i == 1)
7092 alu.dst.write = 1;
7093 if (i == 2)
7094 alu.last = 1;
7095
7096 r = r600_bytecode_add_alu(ctx->bc, &alu);
7097 if (r)
7098 return r;
7099 }
7100 } else {
7101 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7102 alu.op = ALU_OP1_EXP_IEEE;
7103 alu.src[0].sel = ctx->temp_reg;
7104 alu.src[0].chan = 1;
7105
7106 alu.dst.sel = ctx->temp_reg;
7107 alu.dst.chan = 1;
7108 alu.dst.write = 1;
7109 alu.last = 1;
7110
7111 r = r600_bytecode_add_alu(ctx->bc, &alu);
7112 if (r)
7113 return r;
7114 }
7115
7116 if (ctx->bc->chip_class == CAYMAN) {
7117 for (i = 0; i < 3; i++) {
7118 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7119 alu.op = ALU_OP1_RECIP_IEEE;
7120 alu.src[0].sel = ctx->temp_reg;
7121 alu.src[0].chan = 1;
7122
7123 alu.dst.sel = ctx->temp_reg;
7124 alu.dst.chan = i;
7125 if (i == 1)
7126 alu.dst.write = 1;
7127 if (i == 2)
7128 alu.last = 1;
7129
7130 r = r600_bytecode_add_alu(ctx->bc, &alu);
7131 if (r)
7132 return r;
7133 }
7134 } else {
7135 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7136 alu.op = ALU_OP1_RECIP_IEEE;
7137 alu.src[0].sel = ctx->temp_reg;
7138 alu.src[0].chan = 1;
7139
7140 alu.dst.sel = ctx->temp_reg;
7141 alu.dst.chan = 1;
7142 alu.dst.write = 1;
7143 alu.last = 1;
7144
7145 r = r600_bytecode_add_alu(ctx->bc, &alu);
7146 if (r)
7147 return r;
7148 }
7149
7150 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7151
7152 alu.op = ALU_OP2_MUL;
7153
7154 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7155 r600_bytecode_src_set_abs(&alu.src[0]);
7156
7157 alu.src[1].sel = ctx->temp_reg;
7158 alu.src[1].chan = 1;
7159
7160 alu.dst.sel = ctx->temp_reg;
7161 alu.dst.chan = 1;
7162 alu.dst.write = 1;
7163 alu.last = 1;
7164
7165 r = r600_bytecode_add_alu(ctx->bc, &alu);
7166 if (r)
7167 return r;
7168 }
7169
7170 /* result.z = log2(|src|);*/
7171 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
7172 if (ctx->bc->chip_class == CAYMAN) {
7173 for (i = 0; i < 3; i++) {
7174 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7175
7176 alu.op = ALU_OP1_LOG_IEEE;
7177 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7178 r600_bytecode_src_set_abs(&alu.src[0]);
7179
7180 alu.dst.sel = ctx->temp_reg;
7181 if (i == 2)
7182 alu.dst.write = 1;
7183 alu.dst.chan = i;
7184 if (i == 2)
7185 alu.last = 1;
7186
7187 r = r600_bytecode_add_alu(ctx->bc, &alu);
7188 if (r)
7189 return r;
7190 }
7191 } else {
7192 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7193
7194 alu.op = ALU_OP1_LOG_IEEE;
7195 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7196 r600_bytecode_src_set_abs(&alu.src[0]);
7197
7198 alu.dst.sel = ctx->temp_reg;
7199 alu.dst.write = 1;
7200 alu.dst.chan = 2;
7201 alu.last = 1;
7202
7203 r = r600_bytecode_add_alu(ctx->bc, &alu);
7204 if (r)
7205 return r;
7206 }
7207 }
7208
7209 /* result.w = 1.0; */
7210 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
7211 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7212
7213 alu.op = ALU_OP1_MOV;
7214 alu.src[0].sel = V_SQ_ALU_SRC_1;
7215 alu.src[0].chan = 0;
7216
7217 alu.dst.sel = ctx->temp_reg;
7218 alu.dst.chan = 3;
7219 alu.dst.write = 1;
7220 alu.last = 1;
7221
7222 r = r600_bytecode_add_alu(ctx->bc, &alu);
7223 if (r)
7224 return r;
7225 }
7226
7227 return tgsi_helper_copy(ctx, inst);
7228 }
7229
7230 static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
7231 {
7232 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7233 struct r600_bytecode_alu alu;
7234 int r;
7235 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7236 unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);
7237
7238 assert(inst->Dst[0].Register.Index < 3);
7239 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7240
7241 switch (inst->Instruction.Opcode) {
7242 case TGSI_OPCODE_ARL:
7243 alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
7244 break;
7245 case TGSI_OPCODE_ARR:
7246 alu.op = ALU_OP1_FLT_TO_INT;
7247 break;
7248 case TGSI_OPCODE_UARL:
7249 alu.op = ALU_OP1_MOV;
7250 break;
7251 default:
7252 assert(0);
7253 return -1;
7254 }
7255
7256 for (i = 0; i <= lasti; ++i) {
7257 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7258 continue;
7259 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7260 alu.last = i == lasti;
7261 alu.dst.sel = reg;
7262 alu.dst.chan = i;
7263 alu.dst.write = 1;
7264 r = r600_bytecode_add_alu(ctx->bc, &alu);
7265 if (r)
7266 return r;
7267 }
7268
7269 if (inst->Dst[0].Register.Index > 0)
7270 ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
7271 else
7272 ctx->bc->ar_loaded = 0;
7273
7274 return 0;
7275 }
/* r600/r700 ARL/ARR/UARL: build the address value in ctx->bc->ar_reg and
 * mark the hardware AR as stale so it is reloaded before the next indirect
 * access. Unlike the Evergreen path, FLT_TO_INT is a trans-unit-only op
 * here, so each conversion must terminate an instruction group. */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* ARL wants floor(): FLOOR first, then convert to int */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		/* note: converts all channels, not just the write mask */
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		/* ARR rounds: single conversion per channel */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		/* source is already an integer: plain move */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* force a reload of AR before the next indirect access */
	ctx->bc->ar_loaded = 0;
	return 0;
}
7352
7353 static int tgsi_opdst(struct r600_shader_ctx *ctx)
7354 {
7355 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7356 struct r600_bytecode_alu alu;
7357 int i, r = 0;
7358
7359 for (i = 0; i < 4; i++) {
7360 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7361
7362 alu.op = ALU_OP2_MUL;
7363 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7364
7365 if (i == 0 || i == 3) {
7366 alu.src[0].sel = V_SQ_ALU_SRC_1;
7367 } else {
7368 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7369 }
7370
7371 if (i == 0 || i == 2) {
7372 alu.src[1].sel = V_SQ_ALU_SRC_1;
7373 } else {
7374 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
7375 }
7376 if (i == 3)
7377 alu.last = 1;
7378 r = r600_bytecode_add_alu(ctx->bc, &alu);
7379 if (r)
7380 return r;
7381 }
7382 return 0;
7383 }
7384
7385 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
7386 {
7387 struct r600_bytecode_alu alu;
7388 int r;
7389
7390 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7391 alu.op = opcode;
7392 alu.execute_mask = 1;
7393 alu.update_pred = 1;
7394
7395 alu.dst.sel = ctx->temp_reg;
7396 alu.dst.write = 1;
7397 alu.dst.chan = 0;
7398
7399 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7400 alu.src[1].sel = V_SQ_ALU_SRC_0;
7401 alu.src[1].chan = 0;
7402
7403 alu.last = 1;
7404
7405 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
7406 if (r)
7407 return r;
7408 return 0;
7409 }
7410
/* Emit `pops` stack pops. When the previous CF instruction is an ALU
 * clause, up to two pops can be folded into it by promoting it to
 * ALU_POP_AFTER / ALU_POP2_AFTER; otherwise (or when a new CF instruction
 * was forced) an explicit CF_OP_POP is emitted. */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop = pops already folded into the last clause:
		 * 0 for a plain ALU, 1 for ALU_POP_AFTER, else 3 (= can't fold) */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			/* more than two total pops: need an explicit POP */
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		/* continue execution right after the POP itself */
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
7443
/* Recompute the worst-case hardware branch-stack depth (in STACK_SIZE
 * entries) after a push of kind `reason`, applying per-generation quirks,
 * and accumulate the maximum into ctx->bc->stack.max_entries. */
static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
					      unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements, entries;

	unsigned entry_size = stack->entry_size;

	/* current stack usage in elements: full entries for loops and WQM
	 * pushes, single elements for plain pushes */
	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	/* round elements up to whole STACK_SIZE entries */
	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
}
7507
7508 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
7509 {
7510 switch(reason) {
7511 case FC_PUSH_VPM:
7512 --ctx->bc->stack.push;
7513 assert(ctx->bc->stack.push >= 0);
7514 break;
7515 case FC_PUSH_WQM:
7516 --ctx->bc->stack.push_wqm;
7517 assert(ctx->bc->stack.push_wqm >= 0);
7518 break;
7519 case FC_LOOP:
7520 --ctx->bc->stack.loop;
7521 assert(ctx->bc->stack.loop >= 0);
7522 break;
7523 default:
7524 assert(0);
7525 break;
7526 }
7527 }
7528
7529 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
7530 {
7531 switch (reason) {
7532 case FC_PUSH_VPM:
7533 ++ctx->bc->stack.push;
7534 break;
7535 case FC_PUSH_WQM:
7536 ++ctx->bc->stack.push_wqm;
7537 case FC_LOOP:
7538 ++ctx->bc->stack.loop;
7539 break;
7540 default:
7541 assert(0);
7542 }
7543
7544 callstack_update_max_depth(ctx, reason);
7545 }
7546
/* Append the current (last emitted) CF instruction to the "mid" list of
 * the flow-control frame at fc_stack[fc_sp], so its branch target can be
 * patched when the frame is closed (e.g. ELSE, loop BREAK/CONTINUE).
 * NOTE(review): the realloc() return value is unchecked — on allocation
 * failure sp->mid becomes NULL, the next line dereferences it, and the
 * old array leaks. The void return leaves no way to report the error. */
static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
{
	struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];

	sp->mid = realloc((void *)sp->mid,
				sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
	sp->mid[sp->num_mid] = ctx->bc->cf_last;
	sp->num_mid++;
}
7556
7557 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
7558 {
7559 ctx->bc->fc_sp++;
7560 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
7561 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
7562 }
7563
7564 static void fc_poplevel(struct r600_shader_ctx *ctx)
7565 {
7566 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
7567 free(sp->mid);
7568 sp->mid = NULL;
7569 sp->num_mid = 0;
7570 sp->start = NULL;
7571 sp->type = 0;
7572 ctx->bc->fc_sp--;
7573 }
7574
#if 0
/* NOTE(review): disabled scaffolding for subroutine/RET lowering, never
 * compiled.  The r600_bytecode_add_cfinst() calls below carry a stray
 * extra ')' (e.g. "...CF_OP_RETURN));"), so this block would not even
 * parse if the #if 0 were removed — treat it as a design sketch only. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN));
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP));
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

/* Stub: would record "returned inside a loop" in a flag. */
static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

/* Stub: would test the flag set by emit_setret_in_loop_flag(). */
static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
7622
7623 static int emit_if(struct r600_shader_ctx *ctx, int opcode)
7624 {
7625 int alu_type = CF_OP_ALU_PUSH_BEFORE;
7626
7627 /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
7628 * LOOP_STARTxxx for nested loops may put the branch stack into a state
7629 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
7630 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
7631 if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
7632 r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
7633 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
7634 alu_type = CF_OP_ALU;
7635 }
7636
7637 emit_logic_pred(ctx, opcode, alu_type);
7638
7639 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
7640
7641 fc_pushlevel(ctx, FC_IF);
7642
7643 callstack_push(ctx, FC_PUSH_VPM);
7644 return 0;
7645 }
7646
/* TGSI IF: float condition, branch taken when src != 0.0. */
static int tgsi_if(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE);
}
7651
/* TGSI UIF: integer condition, branch taken when src != 0. */
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
}
7656
7657 static int tgsi_else(struct r600_shader_ctx *ctx)
7658 {
7659 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
7660 ctx->bc->cf_last->pop_count = 1;
7661
7662 fc_set_mid(ctx, ctx->bc->fc_sp);
7663 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
7664 return 0;
7665 }
7666
/* TGSI ENDIF: pop one level off the HW branch stack and patch the pending
 * branch target(s) of the IF (and optional ELSE) being closed. */
static int tgsi_endif(struct r600_shader_ctx *ctx)
{
	pops(ctx, 1);
	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
		R600_ERR("if/endif unbalanced in shader\n");
		return -1;
	}

	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
		/* No ELSE was emitted: the JUMP from emit_if() skips past this
		 * ENDIF (id + 2 = following CF entry, same convention as the
		 * loop fixups in tgsi_endloop). */
		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
	} else {
		/* With an ELSE: mid[0] is the ELSE CF (see tgsi_else); it
		 * jumps past the ENDIF.  The opening JUMP was already pointed
		 * at the ELSE by tgsi_else(). */
		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
	}
	fc_poplevel(ctx);

	callstack_pop(ctx, FC_PUSH_VPM);
	return 0;
}
7686
7687 static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
7688 {
7689 /* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
7690 * limited to 4096 iterations, like the other LOOP_* instructions. */
7691 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
7692
7693 fc_pushlevel(ctx, FC_LOOP);
7694
7695 /* check stack depth */
7696 callstack_push(ctx, FC_LOOP);
7697 return 0;
7698 }
7699
7700 static int tgsi_endloop(struct r600_shader_ctx *ctx)
7701 {
7702 int i;
7703
7704 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
7705
7706 if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
7707 R600_ERR("loop/endloop in shader code are not paired.\n");
7708 return -EINVAL;
7709 }
7710
7711 /* fixup loop pointers - from r600isa
7712 LOOP END points to CF after LOOP START,
7713 LOOP START point to CF after LOOP END
7714 BRK/CONT point to LOOP END CF
7715 */
7716 ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
7717
7718 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
7719
7720 for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
7721 ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
7722 }
7723 /* XXX add LOOPRET support */
7724 fc_poplevel(ctx);
7725 callstack_pop(ctx, FC_LOOP);
7726 return 0;
7727 }
7728
/* TGSI BREAKC: conditionally break out of the innermost loop based on the
 * value in src0. */
static int tgsi_loop_breakc(struct r600_shader_ctx *ctx)
{
	int r;
	unsigned int fscp;

	/* Walk down the flow-control stack to find the innermost FC_LOOP;
	 * fscp == 0 means no enclosing loop exists. */
	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
	{
		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
			break;
	}
	if (fscp == 0) {
		R600_ERR("BREAKC not inside loop/endloop pair\n");
		return -EINVAL;
	}

	if (ctx->bc->chip_class == EVERGREEN &&
	    ctx->bc->family != CHIP_CYPRESS &&
	    ctx->bc->family != CHIP_JUNIPER) {
		/* HW bug: ALU_BREAK does not save the active mask correctly */
		/* Workaround: open a regular UIF construct around an
		 * unconditional LOOP_BREAK and close it again immediately. */
		r = tgsi_uif(ctx);
		if (r)
			return r;

		r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_BREAK);
		if (r)
			return r;
		/* Register the BREAK so tgsi_endloop() can patch its target. */
		fc_set_mid(ctx, fscp);

		return tgsi_endif(ctx);
	} else {
		/* Fast path: predicate + ALU_BREAK clause.  NOTE(review):
		 * uses PRED_SETE_INT where the workaround path uses SETNE —
		 * presumably ALU_BREAK acts on the inverse predicate sense;
		 * confirm against the ISA docs before touching. */
		r = emit_logic_pred(ctx, ALU_OP2_PRED_SETE_INT, CF_OP_ALU_BREAK);
		if (r)
			return r;
		fc_set_mid(ctx, fscp);
	}

	return 0;
}
7767
7768 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
7769 {
7770 unsigned int fscp;
7771
7772 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
7773 {
7774 if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
7775 break;
7776 }
7777
7778 if (fscp == 0) {
7779 R600_ERR("Break not inside loop/endloop pair\n");
7780 return -EINVAL;
7781 }
7782
7783 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
7784
7785 fc_set_mid(ctx, fscp);
7786
7787 return 0;
7788 }
7789
/* Translate GS EMIT/ENDPRIM into EMIT_VERTEX/CUT_VERTEX CF instructions.
 * src0 must be a literal selecting the vertex stream (read back from the
 * literal pool captured during parsing). */
static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
	int r;

	/* Flush the buffered outputs to the GS ring before the vertex is
	 * emitted (only EMIT needs this, not CUT).  NOTE(review): the return
	 * value of emit_gs_ring_writes() is discarded here — verify whether
	 * it can fail and should be propagated. */
	if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
		emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);

	r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	if (!r) {
		ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
		return emit_inc_ring_offset(ctx, stream, TRUE);
	}
	return r;
}
7806
/* TGSI UMAD: dst = src0 * src1 + src2 (32-bit unsigned).  Lowered to a
 * MULLO_UINT into temp_reg followed by an ADD_INT into the destination,
 * per enabled write-mask channel. */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		if (ctx->bc->chip_class == CAYMAN) {
			/* Cayman has no t-slot: replicate MULLO_UINT across all
			 * four vector slots; only the slot matching the target
			 * channel writes its result (see the CAYMAN notes at
			 * the top of this file).
			 * NOTE(review): copies all NumSrcRegs (3 for UMAD)
			 * operands into a two-operand MULLO — src[2] appears
			 * unused; confirm it is harmless. */
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP2_MULLO_UINT;
				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
				}
				alu.dst.chan = j;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.write = (j == i);
				if (j == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.dst.chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;

			alu.op = ALU_OP2_MULLO_UINT;
			for (j = 0; j < 2; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}

			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}


	/* + src2: add the accumulated product into the real destination. */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
7878
/* Opcode dispatch table for pre-Evergreen chips (R600..RV7xx), indexed by
 * TGSI opcode.  Each entry pairs a hardware opcode (ALU/FETCH/CF) with the
 * translation callback that emits it; tgsi_unsupported rejects the opcode.
 * Bare numeric indices ([22], [23], ...) are gaps in the TGSI opcode space. */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	/* XXX:
	 * For state trackers other than OpenGL, we'll want to use
	 * _RECIP_IEEE instead.
	 */
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_loop_breakc},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
8083
/* Opcode dispatch table for Evergreen chips, indexed by TGSI opcode.
 * Differs from the R600 table mainly in: IEEE RCP/RSQ, native fine
 * derivatives, integer shifts without a t-slot lowering, TG4/LODQ/bitfield
 * ops supported, EG/CM interpolation helpers, and the FP64 (double) block
 * at the end.  Bare numeric indices are gaps in the TGSI opcode space. */
static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	/* FP64 (double) opcodes */
	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
8305
/*
 * Cayman (CM) translation table: maps each TGSI opcode to the hardware
 * opcode (ALU_OP*/FETCH_OP*/CF_OP*) emitted for it, plus the callback
 * that performs the translation.  Entries are designated initializers
 * indexed by TGSI_OPCODE_*; bare numeric indices ([22], [32], [105], ...)
 * are opcode numbers with no TGSI_OPCODE_* name in this tree and are
 * explicitly marked tgsi_unsupported, as are opcodes this backend does
 * not implement.  The cayman_* callbacks exist because several ops that
 * are t-slot-only on earlier chips are replicated across the vector
 * slots on Cayman — see the "CAYMAN notes" comment at the top of this
 * file.  The hardware-opcode field is ignored by callbacks that emit a
 * fixed sequence themselves (those entries use ALU_OP0_NOP or 0).
 */
8306 static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
8307 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
8308 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
8309 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
8310 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
8311 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
8312 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
8313 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
8314 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL, tgsi_op2},
8315 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
8316 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4, tgsi_dp},
8317 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4, tgsi_dp},
8318 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
8319 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN, tgsi_op2},
8320 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX, tgsi_op2},
/* SLT/SLE have no direct hw op: emitted as SETGT/SETGE with operands swapped. */
8321 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
8322 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
8323 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD, tgsi_op3},
8324 [TGSI_OPCODE_SUB] = { ALU_OP2_ADD, tgsi_op2},
8325 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
8326 [TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported},
8327 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
8328 [TGSI_OPCODE_DP2A] = { ALU_OP0_NOP, tgsi_unsupported},
8329 [22] = { ALU_OP0_NOP, tgsi_unsupported},
8330 [23] = { ALU_OP0_NOP, tgsi_unsupported},
8331 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
8332 [TGSI_OPCODE_CLAMP] = { ALU_OP0_NOP, tgsi_unsupported},
8333 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
8334 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
8335 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
8336 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
8337 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow},
8338 [TGSI_OPCODE_XPD] = { ALU_OP0_NOP, tgsi_xpd},
8339 [32] = { ALU_OP0_NOP, tgsi_unsupported},
/* ABS is a MOV; the callback applies the source-modifier (presumably
 * the abs flag on the src — confirm in tgsi_op2's operand setup). */
8340 [TGSI_OPCODE_ABS] = { ALU_OP1_MOV, tgsi_op2},
8341 [34] = { ALU_OP0_NOP, tgsi_unsupported},
8342 [TGSI_OPCODE_DPH] = { ALU_OP2_DOT4, tgsi_dp},
8343 [TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},
8344 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
8345 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
8346 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
8347 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported},
8348 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
8349 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
8350 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
8351 [44] = { ALU_OP0_NOP, tgsi_unsupported},
8352 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
8353 [46] = { ALU_OP0_NOP, tgsi_unsupported},
8354 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
8355 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig},
8356 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
8357 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
8358 [51] = { ALU_OP0_NOP, tgsi_unsupported},
8359 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
8360 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
8361 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
8362 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported},
8363 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
8364 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
8365 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
8366 [59] = { ALU_OP0_NOP, tgsi_unsupported},
8367 [60] = { ALU_OP0_NOP, tgsi_unsupported},
8368 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
8369 [62] = { ALU_OP0_NOP, tgsi_unsupported},
/* No subroutine support: CAL/RET/BGNSUB/ENDSUB are all unsupported. */
8370 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
8371 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
8372 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
8373 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
8374 [TGSI_OPCODE_SCS] = { ALU_OP0_NOP, tgsi_scs},
8375 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
8376 [69] = { ALU_OP0_NOP, tgsi_unsupported},
8377 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
8378 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4, tgsi_dp},
8379 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
/* Structured control flow maps to CF-level instructions. */
8380 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
8381 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
8382 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
8383 [76] = { ALU_OP0_NOP, tgsi_unsupported},
8384 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
8385 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
/* DDX_FINE/DDY_FINE share the coarse gradient fetch ops here. */
8386 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
8387 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
8388 [TGSI_OPCODE_PUSHA] = { ALU_OP0_NOP, tgsi_unsupported},
8389 [TGSI_OPCODE_POPA] = { ALU_OP0_NOP, tgsi_unsupported},
8390 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
8391 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2},
8392 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
8393 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
8394 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
8395 [88] = { ALU_OP0_NOP, tgsi_unsupported},
8396 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
8397 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
8398 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
8399 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
8400 [TGSI_OPCODE_SAD] = { ALU_OP0_NOP, tgsi_unsupported},
8401 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
8402 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
8403 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
/* Geometry-shader stream output. */
8404 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
8405 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
8406 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
8407 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
8408 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
8409 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
8410 [TGSI_OPCODE_TXQ_LZ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
8411 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
8412 [105] = { ALU_OP0_NOP, tgsi_unsupported},
8413 [106] = { ALU_OP0_NOP, tgsi_unsupported},
8414 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
/* Float compares with DX10-style (0/0xffffffff) boolean results. */
8415 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
8416 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
8417 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
8418 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
8419 [112] = { ALU_OP0_NOP, tgsi_unsupported},
8420 [TGSI_OPCODE_CALLNZ] = { ALU_OP0_NOP, tgsi_unsupported},
8421 [114] = { ALU_OP0_NOP, tgsi_unsupported},
8422 [TGSI_OPCODE_BREAKC] = { ALU_OP0_NOP, tgsi_unsupported},
8423 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
8424 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
8425 [118] = { ALU_OP0_NOP, tgsi_unsupported},
8426 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2},
8427 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
8428 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
8429 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
/* INEG is emitted as SUB_INT (0 - src) via the tgsi_ineg callback. */
8430 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
8431 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
8432 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
8433 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
8434 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2},
8435 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2},
8436 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
8437 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
8438 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
8439 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
8440 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
8441 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
8442 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
8443 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
8444 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
8445 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
8446 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
8447 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
8448 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
8449 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
8450 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
8451 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
/* SM5 SAMPLE_*-style resource/sampler-split opcodes: not implemented
 * by this backend (the TEX/TXB/TXL family above is used instead). */
8452 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
8453 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
8454 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
8455 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
8456 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
8457 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
8458 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
8459 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
8460 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
8461 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
8462 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
8463 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
8464 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
8465 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
8466 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
8467 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
/* Memory/atomic opcodes: not wired up for Cayman in this table. */
8468 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
8469 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
8470 [TGSI_OPCODE_MFENCE] = { ALU_OP0_NOP, tgsi_unsupported},
8471 [TGSI_OPCODE_LFENCE] = { ALU_OP0_NOP, tgsi_unsupported},
8472 [TGSI_OPCODE_SFENCE] = { ALU_OP0_NOP, tgsi_unsupported},
8473 [TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported},
8474 [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
8475 [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
8476 [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
8477 [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
8478 [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
8479 [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
8480 [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
8481 [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
8482 [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
8483 [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
8484 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
8485 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
8486 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
8487 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
8488 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
8489 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
8490 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
8491 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_op3},
8492 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_op3},
8493 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
8494 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
8495 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
8496 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
8497 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
8498 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
8499 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
8500 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
8501 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
/* Double-precision (FP64) section: handled by the *_64 and
 * cayman_*_double / egcm_* helpers. */
8502 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
8503 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
8504 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
8505 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
8506 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
8507 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
8508 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
8509 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
8510 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
8511 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
8512 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
8513 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
8514 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
8515 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
8516 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
8517 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
8518 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
8519 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
8520 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
8521 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
8522 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
8523 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
8524 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
/* Sentinel: keeps the array sized to cover every TGSI opcode. */
8525 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
8526 };