r600: workaround empty geom shader.
[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include "r600_sq.h"
24 #include "r600_llvm.h"
25 #include "r600_formats.h"
26 #include "r600_opcodes.h"
27 #include "r600_shader.h"
28 #include "r600d.h"
29
30 #include "sb/sb_public.h"
31
32 #include "pipe/p_shader_tokens.h"
33 #include "tgsi/tgsi_info.h"
34 #include "tgsi/tgsi_parse.h"
35 #include "tgsi/tgsi_scan.h"
36 #include "tgsi/tgsi_dump.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include <stdio.h>
40 #include <errno.h>
41
42 /* CAYMAN notes
43 Why CAYMAN got loops for lots of instructions is explained here.
44
45 -These 8xx t-slot only ops are implemented in all vector slots.
46 MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
47 These 8xx t-slot only opcodes become vector ops, with all four
48 slots expecting the arguments on sources a and b. Result is
49 broadcast to all channels.
50 MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT, MUL_64
51 These 8xx t-slot only opcodes become vector ops in the z, y, and
52 x slots.
53 EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
54 RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
55 SQRT_IEEE/_64
56 SIN/COS
57 The w slot may have an independent co-issued operation, or if the
58 result is required to be in the w slot, the opcode above may be
59 issued in the w slot as well.
60 The compiler must issue the source argument to slots z, y, and x
61 */
62
63 #define R600_SHADER_BUFFER_INFO_SEL (512 + R600_BUFFER_INFO_OFFSET / 16)
64 static int r600_shader_from_tgsi(struct r600_context *rctx,
65 struct r600_pipe_shader *pipeshader,
66 union r600_shader_key key);
67
68
69 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
70 int size, unsigned comp_mask) {
71
72 if (!size)
73 return;
74
75 if (ps->num_arrays == ps->max_arrays) {
76 ps->max_arrays += 64;
77 ps->arrays = realloc(ps->arrays, ps->max_arrays *
78 sizeof(struct r600_shader_array));
79 }
80
81 int n = ps->num_arrays;
82 ++ps->num_arrays;
83
84 ps->arrays[n].comp_mask = comp_mask;
85 ps->arrays[n].gpr_start = start_gpr;
86 ps->arrays[n].gpr_count = size;
87 }
88
89 static void r600_dump_streamout(struct pipe_stream_output_info *so)
90 {
91 unsigned i;
92
93 fprintf(stderr, "STREAMOUT\n");
94 for (i = 0; i < so->num_outputs; i++) {
95 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
96 so->output[i].start_component;
97 fprintf(stderr, " %i: MEM_STREAM%d_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
98 i,
99 so->output[i].stream,
100 so->output[i].output_buffer,
101 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
102 so->output[i].register_index,
103 mask & 1 ? "x" : "",
104 mask & 2 ? "y" : "",
105 mask & 4 ? "z" : "",
106 mask & 8 ? "w" : "",
107 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
108 }
109 }
110
111 static int store_shader(struct pipe_context *ctx,
112 struct r600_pipe_shader *shader)
113 {
114 struct r600_context *rctx = (struct r600_context *)ctx;
115 uint32_t *ptr, i;
116
117 if (shader->bo == NULL) {
118 shader->bo = (struct r600_resource*)
119 pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
120 if (shader->bo == NULL) {
121 return -ENOMEM;
122 }
123 ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
124 if (R600_BIG_ENDIAN) {
125 for (i = 0; i < shader->shader.bc.ndw; ++i) {
126 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
127 }
128 } else {
129 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
130 }
131 rctx->b.ws->buffer_unmap(shader->bo->cs_buf);
132 }
133
134 return 0;
135 }
136
/* Translate a TGSI shader into r600 bytecode and bind the resulting
 * hardware state.
 *
 * Pipeline: optionally dump TGSI + streamout info, translate via
 * r600_shader_from_tgsi(), build bytecode (unless the llvm backend
 * already did), optionally run the sb optimizer/disassembler, upload
 * the shader (and the GS copy shader, if any) to GPU buffers, then
 * update the per-stage state atoms.
 *
 * Returns 0 on success; on any failure the partially-built shader is
 * destroyed and a negative errno is returned.
 */
int r600_pipe_shader_create(struct pipe_context *ctx,
			    struct r600_pipe_shader *shader,
			    union r600_shader_key key)
{
	struct r600_context *rctx = (struct r600_context *)ctx;
	struct r600_pipe_shader_selector *sel = shader->selector;
	int r;
	bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
	/* sb = optimizing backend; can be disabled via debug flags */
	unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
	unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
	unsigned export_shader;

	shader->shader.bc.isa = rctx->isa;

	if (dump) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		tgsi_dump(sel->tokens, 0);

		if (sel->so.num_outputs) {
			r600_dump_streamout(&sel->so);
		}
	}
	r = r600_shader_from_tgsi(rctx, shader, key);
	if (r) {
		R600_ERR("translation from TGSI failed !\n");
		goto error;
	}

	/* disable SB for shaders using doubles */
	use_sb &= !shader->shader.uses_doubles;

	/* Check if the bytecode has already been built. When using the llvm
	 * backend, r600_shader_from_tgsi() will take care of building the
	 * bytecode.
	 */
	if (!shader->shader.bc.bytecode) {
		r = r600_bytecode_build(&shader->shader.bc);
		if (r) {
			R600_ERR("building bytecode failed !\n");
			goto error;
		}
	}

	if (dump && !sb_disasm) {
		fprintf(stderr, "--------------------------------------------------------------\n");
		r600_bytecode_disasm(&shader->shader.bc);
		fprintf(stderr, "______________________________________________________________\n");
	} else if ((dump && sb_disasm) || use_sb) {
		/* sb pass handles both optimization and (optionally) disassembly */
		r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
					     dump, use_sb);
		if (r) {
			R600_ERR("r600_sb_bytecode_process failed !\n");
			goto error;
		}
	}

	if (shader->gs_copy_shader) {
		if (dump) {
			// dump copy shader
			r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
						     &shader->gs_copy_shader->shader, dump, 0);
			if (r)
				goto error;
		}

		if ((r = store_shader(ctx, shader->gs_copy_shader)))
			goto error;
	}

	/* Store the shader in a buffer. */
	if ((r = store_shader(ctx, shader)))
		goto error;

	/* Build state. */
	switch (shader->shader.processor_type) {
	case TGSI_PROCESSOR_GEOMETRY:
		/* GS always comes with a VS-style copy shader feeding the rasterizer */
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_gs_state(ctx, shader);
			evergreen_update_vs_state(ctx, shader->gs_copy_shader);
		} else {
			r600_update_gs_state(ctx, shader);
			r600_update_vs_state(ctx, shader->gs_copy_shader);
		}
		break;
	case TGSI_PROCESSOR_VERTEX:
		/* as_es: VS feeds a GS, so it's set up as an export shader */
		export_shader = key.vs.as_es;
		if (rctx->b.chip_class >= EVERGREEN) {
			if (export_shader)
				evergreen_update_es_state(ctx, shader);
			else
				evergreen_update_vs_state(ctx, shader);
		} else {
			if (export_shader)
				r600_update_es_state(ctx, shader);
			else
				r600_update_vs_state(ctx, shader);
		}
		break;
	case TGSI_PROCESSOR_FRAGMENT:
		if (rctx->b.chip_class >= EVERGREEN) {
			evergreen_update_ps_state(ctx, shader);
		} else {
			r600_update_ps_state(ctx, shader);
		}
		break;
	default:
		r = -EINVAL;
		goto error;
	}
	return 0;

error:
	r600_pipe_shader_destroy(ctx, shader);
	return r;
}
252
/* Release all GPU/CPU resources owned by a pipe shader: the bytecode
 * buffer object, the bytecode itself and the cached command buffer. */
void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
{
	pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
	r600_bytecode_clear(&shader->shader.bc);
	r600_release_command_buffer(&shader->command_buffer);
}
259
260 /*
261 * tgsi -> r600 shader
262 */
263 struct r600_shader_tgsi_instruction;
264
/* A TGSI source operand decoded into r600 terms. */
struct r600_shader_src {
	unsigned				sel;        /* register/constant selector */
	unsigned				swizzle[4]; /* per-channel swizzle */
	unsigned				neg;        /* negate modifier */
	unsigned				abs;        /* absolute-value modifier */
	unsigned				rel;        /* relative (indirect) addressing */
	unsigned				kc_bank;    /* constant cache bank */
	boolean					kc_rel;     /* true if cache bank is indexed */
	uint32_t				value[4];   /* literal values when sel is a literal */
};
275
/* Per-interpolator bookkeeping for evergreen: whether the shader needs
 * this interpolator and which i/j barycentric pair it was assigned. */
struct eg_interp {
	boolean					enabled;
	unsigned				ij_index;
};
280
/* All state carried through a single TGSI -> r600 shader translation. */
struct r600_shader_ctx {
	struct tgsi_shader_info			info;        /* scanned TGSI metadata */
	struct tgsi_parse_context		parse;       /* token stream cursor */
	const struct tgsi_token			*tokens;     /* input TGSI program */
	unsigned				type;        /* TGSI_PROCESSOR_* stage */
	unsigned				file_offset[TGSI_FILE_COUNT]; /* GPR base per TGSI file */
	unsigned				temp_reg;    /* first driver-internal temp GPR */
	const struct r600_shader_tgsi_instruction	*inst_info; /* handler for current opcode */
	struct r600_bytecode			*bc;         /* bytecode being emitted */
	struct r600_shader			*shader;     /* output shader description */
	struct r600_shader_src			src[4];      /* decoded sources of current inst */
	uint32_t				*literals;   /* accumulated literal pool */
	uint32_t				nliterals;
	uint32_t				max_driver_temp_used; /* temps used by current inst */
	boolean					use_llvm;    /* llvm backend emits ALU itself */
	/* needed for evergreen interpolation */
	struct eg_interp		eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
	/* evergreen/cayman also store sample mask in face register */
	int					face_gpr;
	/* sample id is .w component stored in fixed point position register */
	int					fixed_pt_position_gpr;
	int					colors_used; /* number of color FS inputs (two-side) */
	boolean                 clip_vertex_write;   /* shader writes CLIPVERTEX */
	unsigned                cv_output;           /* output slot of CLIPVERTEX */
	unsigned		edgeflag_output;     /* output slot of EDGEFLAG */
	int					fragcoord_input;     /* input slot of POSITION in FS, or unset */
	int					native_integers;     /* true when ints aren't emulated as floats */
	int					next_ring_offset;    /* running GS input ring offset (bytes) */
	int					gs_out_ring_offset;  /* running GS output ring offset (bytes) */
	int					gs_next_vertex;      /* vertices emitted so far by GS */
	struct r600_shader	*gs_for_vs;          /* GS the current VS feeds, if any */
	int					gs_export_gpr_tregs[4]; /* temp GPRs for per-stream export counters */
	const struct pipe_stream_output_info	*gs_stream_output_info;
	unsigned				enabled_stream_buffers_mask;
};
316
/* Maps one TGSI opcode to its r600 opcode and the callback that emits it. */
struct r600_shader_tgsi_instruction {
	unsigned	op;      /* r600 ALU/TEX/VTX opcode */
	int (*process)(struct r600_shader_ctx *ctx); /* emitter, returns 0 or -errno */
};
321
322 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind);
323 static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
324 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
325 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
326 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
327 static int tgsi_else(struct r600_shader_ctx *ctx);
328 static int tgsi_endif(struct r600_shader_ctx *ctx);
329 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
330 static int tgsi_endloop(struct r600_shader_ctx *ctx);
331 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
332 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
333 unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
334 unsigned int dst_reg);
335 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
336 const struct r600_shader_src *shader_src,
337 unsigned chan);
338
/* Validate the current TGSI instruction against what this backend can
 * handle.  Returns 0 if supported, -EINVAL (with an error message)
 * otherwise.  Rejects: multiple destinations (except DFRACEXP),
 * predication, and dimensioned (2D) operands other than constants and
 * GS inputs. */
static int tgsi_is_supported(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
	int j;

	if (i->Instruction.NumDstRegs > 1 && i->Instruction.Opcode != TGSI_OPCODE_DFRACEXP) {
		R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
		return -EINVAL;
	}
	if (i->Instruction.Predicate) {
		R600_ERR("predicate unsupported\n");
		return -EINVAL;
	}
#if 0
	if (i->Instruction.Label) {
		R600_ERR("label unsupported\n");
		return -EINVAL;
	}
#endif
	for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
		if (i->Src[j].Register.Dimension) {
			switch (i->Src[j].Register.File) {
			case TGSI_FILE_CONSTANT:
				break;
			case TGSI_FILE_INPUT:
				if (ctx->type == TGSI_PROCESSOR_GEOMETRY)
					break;
				/* fallthrough: 2D inputs only allowed in GS */
			default:
				R600_ERR("unsupported src %d (dimension %d)\n", j,
					 i->Src[j].Register.Dimension);
				return -EINVAL;
			}
		}
	}
	for (j = 0; j < i->Instruction.NumDstRegs; j++) {
		if (i->Dst[j].Register.Dimension) {
			R600_ERR("unsupported dst (dimension)\n");
			return -EINVAL;
		}
	}
	return 0;
}
381
382 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
383 {
384 if (interpolate == TGSI_INTERPOLATE_COLOR ||
385 interpolate == TGSI_INTERPOLATE_LINEAR ||
386 interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
387 {
388 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
389 int loc;
390
391 switch(location) {
392 case TGSI_INTERPOLATE_LOC_CENTER:
393 loc = 1;
394 break;
395 case TGSI_INTERPOLATE_LOC_CENTROID:
396 loc = 2;
397 break;
398 case TGSI_INTERPOLATE_LOC_SAMPLE:
399 default:
400 loc = 0; break;
401 }
402
403 return is_linear * 3 + loc;
404 }
405
406 return -1;
407 }
408
/* Copy the i/j barycentric pair index previously assigned to this
 * input's interpolator (by evergreen_gpr_count) onto the input itself.
 * Must only be called for inputs that actually interpolate (the
 * assert guards against constant/flat inputs). */
static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
		int input)
{
	int i = eg_get_interpolator_index(
		ctx->shader->input[input].interpolate,
		ctx->shader->input[input].interpolate_location);
	assert(i >= 0);
	ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
}
418
/* Emit the evergreen interpolation ALU sequence for one input: two
 * groups of four co-issued ops (INTERP_ZW then INTERP_XY) that combine
 * the i/j barycentrics with the parameter at the input's LDS position.
 * Returns 0 or the error from r600_bytecode_add_alu(). */
static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
{
	int i, r;
	struct r600_bytecode_alu alu;
	int gpr = 0, base_chan = 0;
	int ij_index = ctx->shader->input[input].ij_index;

	/* work out gpr and base_chan from index */
	gpr = ij_index / 2;
	base_chan = (2 * (ij_index % 2)) + 1;

	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (i < 4)
			alu.op = ALU_OP2_INTERP_ZW;
		else
			alu.op = ALU_OP2_INTERP_XY;

		/* only the middle four ops write the result channels;
		 * the others exist to feed the interpolation hardware */
		if ((i > 1) && (i < 6)) {
			alu.dst.sel = ctx->shader->input[input].gpr;
			alu.dst.write = 1;
		}

		alu.dst.chan = i % 4;

		/* alternate between the j and i barycentric channels */
		alu.src[0].sel = gpr;
		alu.src[0].chan = (base_chan - (i % 2));

		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;

		alu.bank_swizzle_force = SQ_ALU_VEC_210;
		if ((i % 4) == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
459
460 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
461 {
462 int i, r;
463 struct r600_bytecode_alu alu;
464
465 for (i = 0; i < 4; i++) {
466 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
467
468 alu.op = ALU_OP1_INTERP_LOAD_P0;
469
470 alu.dst.sel = ctx->shader->input[input].gpr;
471 alu.dst.write = 1;
472
473 alu.dst.chan = i;
474
475 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
476 alu.src[0].chan = i;
477
478 if (i == 3)
479 alu.last = 1;
480 r = r600_bytecode_add_alu(ctx->bc, &alu);
481 if (r)
482 return r;
483 }
484 return 0;
485 }
486
487 /*
488 * Special export handling in shaders
489 *
490 * shader export ARRAY_BASE for EXPORT_POS:
491 * 60 is position
492 * 61 is misc vector
493 * 62, 63 are clip distance vectors
494 *
495 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
496 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
497 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
498 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
499 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
500 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
501 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
502 * exclusive from render target index)
503 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
504 *
505 *
506 * shader export ARRAY_BASE for EXPORT_PIXEL:
507 * 0-7 CB targets
508 * 61 computed Z vector
509 *
510 * The use of the values exported in the computed Z vector are controlled
511 * by DB_SHADER_CONTROL:
512 * Z_EXPORT_ENABLE - Z as a float in RED
513 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
514 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
515 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
516 * DB_SOURCE_FORMAT - export control restrictions
517 *
518 */
519
520
521 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
522 static int r600_spi_sid(struct r600_shader_io * io)
523 {
524 int index, name = io->name;
525
526 /* These params are handled differently, they don't need
527 * semantic indices, so we'll use 0 for them.
528 */
529 if (name == TGSI_SEMANTIC_POSITION ||
530 name == TGSI_SEMANTIC_PSIZE ||
531 name == TGSI_SEMANTIC_EDGEFLAG ||
532 name == TGSI_SEMANTIC_FACE ||
533 name == TGSI_SEMANTIC_SAMPLEMASK)
534 index = 0;
535 else {
536 if (name == TGSI_SEMANTIC_GENERIC) {
537 /* For generic params simply use sid from tgsi */
538 index = io->sid;
539 } else {
540 /* For non-generic params - pack name and sid into 8 bits */
541 index = 0x80 | (name<<3) | (io->sid);
542 }
543
544 /* Make sure that all really used indices have nonzero value, so
545 * we can just compare it to 0 later instead of comparing the name
546 * with different values to detect special cases. */
547 index++;
548 }
549
550 return index;
551 };
552
553 /* turn input into interpolate on EG */
/* turn input into interpolate on EG */
/* Allocate an LDS parameter slot for the input and emit either the
 * interpolation ALU sequence (interpolate > 0) or a flat load.
 * Inputs without a spi_sid (system params) are skipped.  When the llvm
 * backend is active only the bookkeeping is done; llvm emits the ALUs. */
static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
{
	int r = 0;

	if (ctx->shader->input[index].spi_sid) {
		ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
		if (ctx->shader->input[index].interpolate > 0) {
			evergreen_interp_assign_ij_index(ctx, index);
			if (!ctx->use_llvm)
				r = evergreen_interp_alu(ctx, index);
		} else {
			if (!ctx->use_llvm)
				r = evergreen_interp_flat(ctx, index);
		}
	}
	return r;
}
571
/* Two-sided lighting: per channel, select the back-face color when the
 * face register is not front-facing.  Emits CNDGT(face, front, back)
 * writing the result back over the front color's GPR. */
static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
{
	struct r600_bytecode_alu alu;
	int i, r;
	int gpr_front = ctx->shader->input[front].gpr;
	int gpr_back = ctx->shader->input[back].gpr;

	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP3_CNDGT;
		alu.is_op3 = 1;
		alu.dst.write = 1;
		alu.dst.sel = gpr_front;
		alu.src[0].sel = ctx->face_gpr;
		alu.src[1].sel = gpr_front;
		alu.src[2].sel = gpr_back;

		alu.dst.chan = i;
		alu.src[1].chan = i;
		alu.src[2].chan = i;
		alu.last = (i==3);

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;
	}

	return 0;
}
600
601 static inline int get_address_file_reg(struct r600_shader_ctx *ctx, int index)
602 {
603 return index > 0 ? ctx->bc->index_reg[index - 1] : ctx->bc->ar_reg;
604 }
605
606 static int vs_add_primid_output(struct r600_shader_ctx *ctx, int prim_id_sid)
607 {
608 int i;
609 i = ctx->shader->noutput++;
610 ctx->shader->output[i].name = TGSI_SEMANTIC_PRIMID;
611 ctx->shader->output[i].sid = 0;
612 ctx->shader->output[i].gpr = 0;
613 ctx->shader->output[i].interpolate = TGSI_INTERPOLATE_CONSTANT;
614 ctx->shader->output[i].write_mask = 0x4;
615 ctx->shader->output[i].spi_sid = prim_id_sid;
616
617 return 0;
618 }
619
/* Process one TGSI declaration token: record inputs/outputs on the
 * shader (assigning GPRs, semantic ids and stage-specific bookkeeping),
 * register indirectly-addressed temp arrays, and handle system values.
 * Returns 0 or -EINVAL for unsupported declarations. */
static int tgsi_declaration(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
	int r, i, j, count = d->Range.Last - d->Range.First + 1;

	switch (d->Declaration.File) {
	case TGSI_FILE_INPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->ninput + j;
			assert(i < Elements(ctx->shader->input));
			ctx->shader->input[i].name = d->Semantic.Name;
			ctx->shader->input[i].sid = d->Semantic.Index + j;
			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
			ctx->shader->input[i].interpolate_location = d->Interp.Location;
			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
				switch (ctx->shader->input[i].name) {
				case TGSI_SEMANTIC_FACE:
					if (ctx->face_gpr != -1)
						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
					else
						ctx->face_gpr = ctx->shader->input[i].gpr;
					break;
				case TGSI_SEMANTIC_COLOR:
					ctx->colors_used++;
					break;
				case TGSI_SEMANTIC_POSITION:
					ctx->fragcoord_input = i;
					break;
				case TGSI_SEMANTIC_PRIMID:
					/* set this for now */
					ctx->shader->gs_prim_id_input = true;
					ctx->shader->ps_prim_id_input = i;
					break;
				}
				if (ctx->bc->chip_class >= EVERGREEN) {
					if ((r = evergreen_interp_input(ctx, i)))
						return r;
				}
			} else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				/* FIXME probably skip inputs if they aren't passed in the ring */
				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
				ctx->next_ring_offset += 16;
				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
					ctx->shader->gs_prim_id_input = true;
			}
		}
		ctx->shader->ninput += count;
		break;
	case TGSI_FILE_OUTPUT:
		for (j = 0; j < count; j++) {
			i = ctx->shader->noutput + j;
			assert(i < Elements(ctx->shader->output));
			ctx->shader->output[i].name = d->Semantic.Name;
			ctx->shader->output[i].sid = d->Semantic.Index + j;
			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
			if (ctx->type == TGSI_PROCESSOR_VERTEX ||
			    ctx->type == TGSI_PROCESSOR_GEOMETRY) {
				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_CLIPDIST:
					/* four components per clip-dist vector, shifted by index */
					ctx->shader->clip_dist_write |= d->Declaration.UsageMask <<
									((d->Semantic.Index + j) << 2);
					break;
				case TGSI_SEMANTIC_PSIZE:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_point_size = 1;
					break;
				case TGSI_SEMANTIC_EDGEFLAG:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_edgeflag = 1;
					ctx->edgeflag_output = i;
					break;
				case TGSI_SEMANTIC_VIEWPORT_INDEX:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_viewport = 1;
					break;
				case TGSI_SEMANTIC_LAYER:
					ctx->shader->vs_out_misc_write = 1;
					ctx->shader->vs_out_layer = 1;
					break;
				case TGSI_SEMANTIC_CLIPVERTEX:
					ctx->clip_vertex_write = TRUE;
					ctx->cv_output = i;
					break;
				}
				if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
					ctx->gs_out_ring_offset += 16;
				}
			} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				switch (d->Semantic.Name) {
				case TGSI_SEMANTIC_COLOR:
					ctx->shader->nr_ps_max_color_exports++;
					break;
				}
			}
		}
		ctx->shader->noutput += count;
		break;
	case TGSI_FILE_TEMPORARY:
		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
			if (d->Array.ArrayID) {
				r600_add_gpr_array(ctx->shader,
						   ctx->file_offset[TGSI_FILE_TEMPORARY] +
						   d->Range.First,
						   d->Range.Last - d->Range.First + 1, 0x0F);
			}
		}
		break;

	case TGSI_FILE_CONSTANT:
	case TGSI_FILE_SAMPLER:
	case TGSI_FILE_SAMPLER_VIEW:
	case TGSI_FILE_ADDRESS:
		break;

	case TGSI_FILE_SYSTEM_VALUE:
		if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
			d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
			break; /* Already handled from allocate_system_value_inputs */
		} else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
			/* without native ints, convert the raw instance id
			 * (GPR0.w) to float in place */
			if (!ctx->native_integers) {
				struct r600_bytecode_alu alu;
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_INT_TO_FLT;
				alu.src[0].sel = 0;
				alu.src[0].chan = 3;

				alu.dst.sel = 0;
				alu.dst.chan = 3;
				alu.dst.write = 1;
				alu.last = 1;

				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
			break;
		} else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
			break;
		else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
			break;
		/* fallthrough: any other system value is unsupported */
	default:
		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
		return -EINVAL;
	}
	return 0;
}
772
773 static int r600_get_temp(struct r600_shader_ctx *ctx)
774 {
775 return ctx->temp_reg + ctx->max_driver_temp_used++;
776 }
777
/* Scan the TGSI program for system values that live in fixed hardware
 * GPRs (SAMPLEMASK in the face GPR, SAMPLEID/SAMPLEPOS via the fixed
 * point position GPR) and for interpolateAt* instructions that imply
 * them, then append the enabled ones as shader inputs starting at
 * gpr_offset.  Returns gpr_offset plus the number of GPRs reserved. */
static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
{
	struct tgsi_parse_context parse;
	struct {
		boolean enabled;
		int *reg;                       /* ctx field to receive the GPR */
		unsigned name, alternate_name;  /* TGSI semantics that trigger this input */
	} inputs[2] = {
		{ false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */

		{ false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
	};
	int i, k, num_regs = 0;

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				/* NOTE: this k shadows the outer k */
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					inputs[1].enabled = true; /* needs SAMPLEID */
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
					/* Needs sample positions, currently those are always available */
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		} else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
			struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
			if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				for (k = 0; k < Elements(inputs); k++) {
					if (d->Semantic.Name == inputs[k].name ||
						d->Semantic.Name == inputs[k].alternate_name) {
						inputs[k].enabled = true;
					}
				}
			}
		}
	}

	tgsi_parse_free(&parse);

	/* materialize each enabled system value as a shader input */
	for (i = 0; i < Elements(inputs); i++) {
		boolean enabled = inputs[i].enabled;
		int *reg = inputs[i].reg;
		unsigned name = inputs[i].name;

		if (enabled) {
			int gpr = gpr_offset + num_regs++;

			// add to inputs, allocate a gpr
			k = ctx->shader->ninput ++;
			ctx->shader->input[k].name = name;
			ctx->shader->input[k].sid = 0;
			ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
			ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
			*reg = ctx->shader->input[k].gpr = gpr;
		}
	}

	return gpr_offset + num_regs;
}
857
/*
 * for evergreen we need to scan the shader to find the number of GPRs we need to
 * reserve for interpolation and system values
 *
 * we need to know if we are going to emit
 * any sample or centroid inputs
 * if perspective and linear are required
 *
 * Returns the first free GPR after the barycentric pairs and system
 * value inputs have been reserved (via allocate_system_value_inputs).
 */
static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
{
	int i;
	int num_baryc;
	struct tgsi_parse_context parse;

	memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));

	/* pass 1: mark interpolators needed by declared inputs */
	for (i = 0; i < ctx->info.num_inputs; i++) {
		int k;
		/* skip position/face/mask/sampleid */
		if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
			ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
			ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
			ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
			continue;

		k = eg_get_interpolator_index(
			ctx->info.input_interpolate[i],
			ctx->info.input_interpolate_loc[i]);
		if (k >= 0)
			ctx->eg_interpolators[k].enabled = TRUE;
	}

	if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
		return 0;
	}

	/* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
	while (!tgsi_parse_end_of_tokens(&parse)) {
		tgsi_parse_token(&parse);

		if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
			const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
				inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
			{
				int interpolate, location, k;

				if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
					location = TGSI_INTERPOLATE_LOC_CENTER;
				} else {
					location = TGSI_INTERPOLATE_LOC_CENTROID;
				}

				interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
				k = eg_get_interpolator_index(interpolate, location);
				ctx->eg_interpolators[k].enabled = true;
			}
		}
	}

	tgsi_parse_free(&parse);

	/* assign gpr to each interpolator according to priority */
	num_baryc = 0;
	for (i = 0; i < Elements(ctx->eg_interpolators); i++) {
		if (ctx->eg_interpolators[i].enabled) {
			ctx->eg_interpolators[i].ij_index = num_baryc;
			num_baryc ++;
		}
	}

	/* XXX PULL MODEL and LINE STIPPLE */

	/* two i/j barycentric pairs fit per GPR */
	num_baryc = (num_baryc + 1) >> 1;
	return allocate_system_value_inputs(ctx, num_baryc);
}
937
/* sample_id_sel == NULL means fetch for current sample */
/* Fetch a sample position from the buffer-info constant buffer into a
 * fresh temp GPR and return that GPR number (or a negative error).
 * When sample_id is given, channel chan_sel of it is first MOVed into
 * the temp to serve as the fetch index; otherwise the current sample id
 * (fixed point position GPR .w) is used. */
static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
{
	struct r600_bytecode_vtx vtx;
	int r, t1;

	assert(ctx->fixed_pt_position_gpr != -1);

	t1 = r600_get_temp(ctx);

	memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
	vtx.op = FETCH_OP_VFETCH;
	vtx.buffer_id = R600_BUFFER_INFO_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	if (sample_id == NULL) {
		vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
		vtx.src_sel_x = 3;
	}
	else {
		struct r600_bytecode_alu alu;

		/* copy the requested sample id channel into the temp so the
		 * vtx fetch can index with it */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
		alu.dst.sel = t1;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		vtx.src_gpr = t1;
		vtx.src_sel_x = 0;
	}
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = t1;
	vtx.dst_sel_x = 0;
	vtx.dst_sel_y = 1;
	vtx.dst_sel_z = 2;
	vtx.dst_sel_w = 3;
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;
	vtx.format_comp_all = 1;
	vtx.use_const_fields = 0;
	vtx.offset = 1; // first element is size of buffer
	vtx.endian = r600_endian_swap(32);
	vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */

	r = r600_bytecode_add_vtx(ctx->bc, &vtx);
	if (r)
		return r;

	return t1;
}
992
/* Translate a TGSI source operand into r600_shader_src form: copy the
 * swizzle/negate/absolute modifiers, then resolve the register file to
 * the selector scheme used by the ALU encoder — GPR index (with the
 * per-file offset applied), kcache constant bank, or inline literal.
 * System values are remapped onto the GPR/channel where this driver
 * stages them. */
static void tgsi_src(struct r600_shader_ctx *ctx,
		const struct tgsi_full_src_register *tgsi_src,
		struct r600_shader_src *r600_src)
{
	memset(r600_src, 0, sizeof(*r600_src));
	r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
	r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
	r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
	r600_src->neg = tgsi_src->Register.Negate;
	r600_src->abs = tgsi_src->Register.Absolute;

	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
		int index;
		/* If all four channels read the same immediate component,
		 * try to encode it as one of the hardware's special inline
		 * constants instead of a literal; fall through to the
		 * literal path if no special constant matches. */
		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
			(tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {

			index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
			r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg, r600_src->abs);
			if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
				return;
		}
		index = tgsi_src->Register.Index;
		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
		if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
			/* sample mask is staged in the .z channel of the face GPR */
			r600_src->swizzle[0] = 2; // Z value
			r600_src->swizzle[1] = 2;
			r600_src->swizzle[2] = 2;
			r600_src->swizzle[3] = 2;
			r600_src->sel = ctx->face_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
			/* sample id lives in .w of the fixed-point position GPR */
			r600_src->swizzle[0] = 3; // W value
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = ctx->fixed_pt_position_gpr;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
			/* fetch the position for the current sample into a temp;
			 * z/w read the inline 0 constant (swizzle 4) */
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 1;
			r600_src->swizzle[2] = 4;
			r600_src->swizzle[3] = 4;
			r600_src->sel = load_sample_position(ctx, NULL, -1);
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
			/* instance id arrives in R0.w */
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
			/* vertex id arrives in R0.x */
			r600_src->swizzle[0] = 0;
			r600_src->swizzle[1] = 0;
			r600_src->swizzle[2] = 0;
			r600_src->swizzle[3] = 0;
			r600_src->sel = 0;
		} else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
			/* GS invocation id arrives in R1.w */
			r600_src->swizzle[0] = 3;
			r600_src->swizzle[1] = 3;
			r600_src->swizzle[2] = 3;
			r600_src->swizzle[3] = 3;
			r600_src->sel = 1;
		}
	} else {
		if (tgsi_src->Register.Indirect)
			r600_src->rel = V_SQ_REL_RELATIVE;
		r600_src->sel = tgsi_src->Register.Index;
		r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
	}
	if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
		/* 2D constants carry the constant-buffer index (and possibly
		 * an indirect buffer index) in the Dimension token */
		if (tgsi_src->Register.Dimension) {
			r600_src->kc_bank = tgsi_src->Dimension.Index;
			if (tgsi_src->Dimension.Indirect) {
				r600_src->kc_rel = 1;
			}
		}
	}
}
1072
/* Fetch a relatively-addressed constant from constant buffer cb_idx into
 * dst_reg with a vertex fetch.  The index is taken from channel ar_chan
 * of the AR register; when "offset" is non-zero it is first added into
 * dst_reg (used as scratch) and the fetch indexes from there instead.
 * cb_rel selects index-register-relative buffer addressing.
 * Returns 0 on success or a negative error from the bytecode emitter. */
static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
                                unsigned int cb_idx, unsigned cb_rel, unsigned int offset, unsigned ar_chan,
                                unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	unsigned int ar_reg;
	int r;

	if (offset) {
		/* dst_reg.ar_chan = AR.ar_chan + offset */
		struct r600_bytecode_alu alu;

		memset(&alu, 0, sizeof(alu));

		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.src[0].chan = ar_chan;

		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = offset;

		alu.dst.sel = dst_reg;
		alu.dst.chan = ar_chan;
		alu.dst.write = 1;
		alu.last = 1;

		if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
			return r;

		ar_reg = dst_reg;
	} else {
		ar_reg = ctx->bc->ar_reg;
	}

	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = cb_idx;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = ar_reg;
	vtx.src_sel_x = ar_chan;
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	vtx.data_format = FMT_32_32_32_32_FLOAT;
	vtx.num_format_all = 2;		/* NUM_FORMAT_SCALED */
	vtx.format_comp_all = 1;	/* FORMAT_COMP_SIGNED */
	vtx.endian = r600_endian_swap(32);
	vtx.buffer_index_mode = cb_rel; // cb_rel ? V_SQ_CF_INDEX_0 : V_SQ_CF_INDEX_NONE;

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1128
/* Fetch one per-vertex GS input from the ESGS ring into dst_reg.
 *
 * vtx_id selects which incoming vertex's ring offset to use; the offsets
 * are delivered to the GS packed into R0.x, R0.y, R0.w, R1.x, R1.y, R1.z
 * (R0.z apparently carries PrimitiveID), hence the /3, %3 split and the
 * z->w adjustment below.  "index" is the input slot; each slot occupies
 * 16 bytes in the ring.  Returns 0 or a negative emitter error. */
static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
{
	struct r600_bytecode_vtx vtx;
	int r;
	unsigned index = src->Register.Index;
	unsigned vtx_id = src->Dimension.Index;
	int offset_reg = vtx_id / 3;
	int offset_chan = vtx_id % 3;

	/* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
	 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */

	if (offset_reg == 0 && offset_chan == 2)
		offset_chan = 3;

	if (src->Dimension.Indirect) {
		int treg[3];
		int t2;
		struct r600_bytecode_alu alu;
		int r, i;

		/* The packed offsets can't be relatively indexed in place:
		 * copy R0.x/y/w into Rt.x, Rt+1.x, Rt+2.x and relatively
		 * index that small array into t2 instead — this matches the
		 * sequence fglrx generates. */
		for (i = 0; i < 3; i++) {
			treg[i] = r600_get_temp(ctx);
		}
		r600_add_gpr_array(ctx->shader, treg[0], 3, 0x0F);

		t2 = r600_get_temp(ctx);
		for (i = 0; i < 3; i++) {
			/* Rt+i.x = R0.{x,y,w}[i] */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			alu.src[0].sel = 0;
			alu.src[0].chan = i == 2 ? 3 : i;
			alu.dst.sel = treg[i];
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		/* t2.x = Rt[AR].x */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = treg[0];
		alu.src[0].rel = 1;
		alu.dst.sel = t2;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
		offset_reg = t2;
	}


	memset(&vtx, 0, sizeof(vtx));
	vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
	vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
	vtx.src_gpr = offset_reg;
	vtx.src_sel_x = offset_chan;
	vtx.offset = index * 16; /*bytes*/
	vtx.mega_fetch_count = 16;
	vtx.dst_gpr = dst_reg;
	vtx.dst_sel_x = 0;		/* SEL_X */
	vtx.dst_sel_y = 1;		/* SEL_Y */
	vtx.dst_sel_z = 2;		/* SEL_Z */
	vtx.dst_sel_w = 3;		/* SEL_W */
	if (ctx->bc->chip_class >= EVERGREEN) {
		vtx.use_const_fields = 1;
	} else {
		vtx.data_format = FMT_32_32_32_32_FLOAT;
	}

	if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
		return r;

	return 0;
}
1209
1210 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1211 {
1212 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1213 int i;
1214
1215 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1216 struct tgsi_full_src_register *src = &inst->Src[i];
1217
1218 if (src->Register.File == TGSI_FILE_INPUT) {
1219 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1220 /* primitive id is in R0.z */
1221 ctx->src[i].sel = 0;
1222 ctx->src[i].swizzle[0] = 2;
1223 }
1224 }
1225 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1226 int treg = r600_get_temp(ctx);
1227
1228 fetch_gs_input(ctx, src, treg);
1229 ctx->src[i].sel = treg;
1230 }
1231 }
1232 return 0;
1233 }
1234
/* Lower an instruction's constant-file operands so the ALU encoder only
 * sees at most one direct kcache constant: all but the last plain
 * constant source are copied to temp GPRs, and every relatively-indexed
 * constant is replaced by an explicit vertex fetch.
 * Also translates all sources into ctx->src[] as a side effect.
 * Returns 0 on success or a negative emitter error. */
static int tgsi_split_constant(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, nconst, r;

	/* translate every source and count the constant-file ones */
	for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
			nconst++;
		}
		tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
	}
	/* j counts how many constants may still be moved; when it reaches
	 * zero the remaining (last) plain constant stays in place */
	for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
		if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
			continue;
		}

		if (ctx->src[i].rel) {
			/* relative addressing: fetch through a VTX op; the
			 * "- 512" undoes the TGSI_FILE_CONSTANT file offset
			 * applied by tgsi_src */
			int chan = inst->Src[i].Indirect.Swizzle;
			int treg = r600_get_temp(ctx);
			if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].kc_rel, ctx->src[i].sel - 512, chan, treg)))
				return r;

			ctx->src[i].kc_bank = 0;
			ctx->src[i].kc_rel = 0;
			ctx->src[i].sel = treg;
			ctx->src[i].rel = 0;
			j--;
		} else if (j > 0) {
			/* copy the constant into a temp GPR, one MOV per
			 * component, and retarget the source at the temp */
			int treg = r600_get_temp(ctx);
			for (k = 0; k < 4; k++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_MOV;
				alu.src[0].sel = ctx->src[i].sel;
				alu.src[0].chan = k;
				alu.src[0].rel = ctx->src[i].rel;
				alu.src[0].kc_bank = ctx->src[i].kc_bank;
				alu.src[0].kc_rel = ctx->src[i].kc_rel;
				alu.dst.sel = treg;
				alu.dst.chan = k;
				alu.dst.write = 1;
				if (k == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
			ctx->src[i].sel = treg;
			ctx->src[i].rel =0;
			j--;
		}
	}
	return 0;
}
1289
1290 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1291 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1292 {
1293 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1294 struct r600_bytecode_alu alu;
1295 int i, j, k, nliteral, r;
1296
1297 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1298 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1299 nliteral++;
1300 }
1301 }
1302 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1303 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1304 int treg = r600_get_temp(ctx);
1305 for (k = 0; k < 4; k++) {
1306 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1307 alu.op = ALU_OP1_MOV;
1308 alu.src[0].sel = ctx->src[i].sel;
1309 alu.src[0].chan = k;
1310 alu.src[0].value = ctx->src[i].value[k];
1311 alu.dst.sel = treg;
1312 alu.dst.chan = k;
1313 alu.dst.write = 1;
1314 if (k == 3)
1315 alu.last = 1;
1316 r = r600_bytecode_add_alu(ctx->bc, &alu);
1317 if (r)
1318 return r;
1319 }
1320 ctx->src[i].sel = treg;
1321 j--;
1322 }
1323 }
1324 return 0;
1325 }
1326
1327 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1328 {
1329 int i, r, count = ctx->shader->ninput;
1330
1331 for (i = 0; i < count; i++) {
1332 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1333 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
1334 if (r)
1335 return r;
1336 }
1337 }
1338 return 0;
1339 }
1340
1341 static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so,
1342 int stream, unsigned *stream_item_size)
1343 {
1344 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
1345 unsigned start_comp[PIPE_MAX_SHADER_OUTPUTS];
1346 int i, j, r;
1347
1348 /* Sanity checking. */
1349 if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
1350 R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
1351 r = -EINVAL;
1352 goto out_err;
1353 }
1354 for (i = 0; i < so->num_outputs; i++) {
1355 if (so->output[i].output_buffer >= 4) {
1356 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
1357 so->output[i].output_buffer);
1358 r = -EINVAL;
1359 goto out_err;
1360 }
1361 }
1362
1363 /* Initialize locations where the outputs are stored. */
1364 for (i = 0; i < so->num_outputs; i++) {
1365
1366 so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
1367 start_comp[i] = so->output[i].start_component;
1368 /* Lower outputs with dst_offset < start_component.
1369 *
1370 * We can only output 4D vectors with a write mask, e.g. we can
1371 * only output the W component at offset 3, etc. If we want
1372 * to store Y, Z, or W at buffer offset 0, we need to use MOV
1373 * to move it to X and output X. */
1374 if (so->output[i].dst_offset < so->output[i].start_component) {
1375 unsigned tmp = r600_get_temp(ctx);
1376
1377 for (j = 0; j < so->output[i].num_components; j++) {
1378 struct r600_bytecode_alu alu;
1379 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1380 alu.op = ALU_OP1_MOV;
1381 alu.src[0].sel = so_gpr[i];
1382 alu.src[0].chan = so->output[i].start_component + j;
1383
1384 alu.dst.sel = tmp;
1385 alu.dst.chan = j;
1386 alu.dst.write = 1;
1387 if (j == so->output[i].num_components - 1)
1388 alu.last = 1;
1389 r = r600_bytecode_add_alu(ctx->bc, &alu);
1390 if (r)
1391 return r;
1392 }
1393 start_comp[i] = 0;
1394 so_gpr[i] = tmp;
1395 }
1396 }
1397
1398 /* Write outputs to buffers. */
1399 for (i = 0; i < so->num_outputs; i++) {
1400 struct r600_bytecode_output output;
1401
1402 if (stream != -1 && stream != so->output[i].output_buffer)
1403 continue;
1404
1405 memset(&output, 0, sizeof(struct r600_bytecode_output));
1406 output.gpr = so_gpr[i];
1407 output.elem_size = so->output[i].num_components - 1;
1408 if (output.elem_size == 2)
1409 output.elem_size = 3; // 3 not supported, write 4 with junk at end
1410 output.array_base = so->output[i].dst_offset - start_comp[i];
1411 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1412 output.burst_count = 1;
1413 /* array_size is an upper limit for the burst_count
1414 * with MEM_STREAM instructions */
1415 output.array_size = 0xFFF;
1416 output.comp_mask = ((1 << so->output[i].num_components) - 1) << start_comp[i];
1417
1418 if (ctx->bc->chip_class >= EVERGREEN) {
1419 switch (so->output[i].output_buffer) {
1420 case 0:
1421 output.op = CF_OP_MEM_STREAM0_BUF0;
1422 break;
1423 case 1:
1424 output.op = CF_OP_MEM_STREAM0_BUF1;
1425 break;
1426 case 2:
1427 output.op = CF_OP_MEM_STREAM0_BUF2;
1428 break;
1429 case 3:
1430 output.op = CF_OP_MEM_STREAM0_BUF3;
1431 break;
1432 }
1433 output.op += so->output[i].stream * 4;
1434 assert(output.op >= CF_OP_MEM_STREAM0_BUF0 && output.op <= CF_OP_MEM_STREAM3_BUF3);
1435 ctx->enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
1436 } else {
1437 switch (so->output[i].output_buffer) {
1438 case 0:
1439 output.op = CF_OP_MEM_STREAM0;
1440 break;
1441 case 1:
1442 output.op = CF_OP_MEM_STREAM1;
1443 break;
1444 case 2:
1445 output.op = CF_OP_MEM_STREAM2;
1446 break;
1447 case 3:
1448 output.op = CF_OP_MEM_STREAM3;
1449 break;
1450 }
1451 ctx->enabled_stream_buffers_mask |= 1 << so->output[i].output_buffer;
1452 }
1453 r = r600_bytecode_add_output(ctx->bc, &output);
1454 if (r)
1455 goto out_err;
1456 }
1457 return 0;
1458 out_err:
1459 return r;
1460 }
1461
1462 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
1463 {
1464 struct r600_bytecode_alu alu;
1465 unsigned reg;
1466
1467 if (!ctx->shader->vs_out_edgeflag)
1468 return;
1469
1470 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
1471
1472 /* clamp(x, 0, 1) */
1473 memset(&alu, 0, sizeof(alu));
1474 alu.op = ALU_OP1_MOV;
1475 alu.src[0].sel = reg;
1476 alu.dst.sel = reg;
1477 alu.dst.write = 1;
1478 alu.dst.clamp = 1;
1479 alu.last = 1;
1480 r600_bytecode_add_alu(ctx->bc, &alu);
1481
1482 memset(&alu, 0, sizeof(alu));
1483 alu.op = ALU_OP1_FLT_TO_INT;
1484 alu.src[0].sel = reg;
1485 alu.dst.sel = reg;
1486 alu.dst.write = 1;
1487 alu.last = 1;
1488 r600_bytecode_add_alu(ctx->bc, &alu);
1489 }
1490
1491 static int generate_gs_copy_shader(struct r600_context *rctx,
1492 struct r600_pipe_shader *gs,
1493 struct pipe_stream_output_info *so)
1494 {
1495 struct r600_shader_ctx ctx = {};
1496 struct r600_shader *gs_shader = &gs->shader;
1497 struct r600_pipe_shader *cshader;
1498 int ocnt = gs_shader->noutput;
1499 struct r600_bytecode_alu alu;
1500 struct r600_bytecode_vtx vtx;
1501 struct r600_bytecode_output output;
1502 struct r600_bytecode_cf *cf_jump, *cf_pop,
1503 *last_exp_pos = NULL, *last_exp_param = NULL;
1504 int i, j, next_clip_pos = 61, next_param = 0;
1505 int ring;
1506
1507 cshader = calloc(1, sizeof(struct r600_pipe_shader));
1508 if (!cshader)
1509 return 0;
1510
1511 memcpy(cshader->shader.output, gs_shader->output, ocnt *
1512 sizeof(struct r600_shader_io));
1513
1514 cshader->shader.noutput = ocnt;
1515
1516 ctx.shader = &cshader->shader;
1517 ctx.bc = &ctx.shader->bc;
1518 ctx.type = ctx.bc->type = TGSI_PROCESSOR_VERTEX;
1519
1520 r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
1521 rctx->screen->has_compressed_msaa_texturing);
1522
1523 ctx.bc->isa = rctx->isa;
1524
1525 cf_jump = NULL;
1526 memset(cshader->shader.ring_item_sizes, 0, sizeof(cshader->shader.ring_item_sizes));
1527
1528 /* R0.x = R0.x & 0x3fffffff */
1529 memset(&alu, 0, sizeof(alu));
1530 alu.op = ALU_OP2_AND_INT;
1531 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1532 alu.src[1].value = 0x3fffffff;
1533 alu.dst.write = 1;
1534 r600_bytecode_add_alu(ctx.bc, &alu);
1535
1536 /* R0.y = R0.x >> 30 */
1537 memset(&alu, 0, sizeof(alu));
1538 alu.op = ALU_OP2_LSHR_INT;
1539 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1540 alu.src[1].value = 0x1e;
1541 alu.dst.chan = 1;
1542 alu.dst.write = 1;
1543 alu.last = 1;
1544 r600_bytecode_add_alu(ctx.bc, &alu);
1545
1546 /* fetch vertex data from GSVS ring */
1547 for (i = 0; i < ocnt; ++i) {
1548 struct r600_shader_io *out = &ctx.shader->output[i];
1549
1550 out->gpr = i + 1;
1551 out->ring_offset = i * 16;
1552
1553 memset(&vtx, 0, sizeof(vtx));
1554 vtx.op = FETCH_OP_VFETCH;
1555 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1556 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
1557 vtx.offset = out->ring_offset;
1558 vtx.dst_gpr = out->gpr;
1559 vtx.src_gpr = 0;
1560 vtx.dst_sel_x = 0;
1561 vtx.dst_sel_y = 1;
1562 vtx.dst_sel_z = 2;
1563 vtx.dst_sel_w = 3;
1564 if (rctx->b.chip_class >= EVERGREEN) {
1565 vtx.use_const_fields = 1;
1566 } else {
1567 vtx.data_format = FMT_32_32_32_32_FLOAT;
1568 }
1569
1570 r600_bytecode_add_vtx(ctx.bc, &vtx);
1571 }
1572 ctx.temp_reg = i + 1;
1573 for (ring = 3; ring >= 0; --ring) {
1574 bool enabled = false;
1575 for (i = 0; i < so->num_outputs; i++) {
1576 if (so->output[i].stream == ring) {
1577 enabled = true;
1578 break;
1579 }
1580 }
1581 if (ring != 0 && !enabled) {
1582 cshader->shader.ring_item_sizes[ring] = 0;
1583 continue;
1584 }
1585
1586 if (cf_jump) {
1587 // Patch up jump label
1588 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
1589 cf_pop = ctx.bc->cf_last;
1590
1591 cf_jump->cf_addr = cf_pop->id + 2;
1592 cf_jump->pop_count = 1;
1593 cf_pop->cf_addr = cf_pop->id + 2;
1594 cf_pop->pop_count = 1;
1595 }
1596
1597 /* PRED_SETE_INT __, R0.y, ring */
1598 memset(&alu, 0, sizeof(alu));
1599 alu.op = ALU_OP2_PRED_SETE_INT;
1600 alu.src[0].chan = 1;
1601 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1602 alu.src[1].value = ring;
1603 alu.execute_mask = 1;
1604 alu.update_pred = 1;
1605 alu.last = 1;
1606 r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
1607
1608 r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
1609 cf_jump = ctx.bc->cf_last;
1610
1611 if (enabled)
1612 emit_streamout(&ctx, so, ring, &cshader->shader.ring_item_sizes[ring]);
1613 cshader->shader.ring_item_sizes[ring] = ocnt * 16;
1614 }
1615
1616 /* export vertex data */
1617 /* XXX factor out common code with r600_shader_from_tgsi ? */
1618 for (i = 0; i < ocnt; ++i) {
1619 struct r600_shader_io *out = &ctx.shader->output[i];
1620 bool instream0 = true;
1621 if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
1622 continue;
1623
1624 for (j = 0; j < so->num_outputs; j++) {
1625 if (so->output[j].register_index == i) {
1626 if (so->output[j].stream == 0)
1627 break;
1628 if (so->output[j].stream > 0)
1629 instream0 = false;
1630 }
1631 }
1632 if (!instream0)
1633 continue;
1634 memset(&output, 0, sizeof(output));
1635 output.gpr = out->gpr;
1636 output.elem_size = 3;
1637 output.swizzle_x = 0;
1638 output.swizzle_y = 1;
1639 output.swizzle_z = 2;
1640 output.swizzle_w = 3;
1641 output.burst_count = 1;
1642 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1643 output.op = CF_OP_EXPORT;
1644 switch (out->name) {
1645 case TGSI_SEMANTIC_POSITION:
1646 output.array_base = 60;
1647 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1648 break;
1649
1650 case TGSI_SEMANTIC_PSIZE:
1651 output.array_base = 61;
1652 if (next_clip_pos == 61)
1653 next_clip_pos = 62;
1654 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1655 output.swizzle_y = 7;
1656 output.swizzle_z = 7;
1657 output.swizzle_w = 7;
1658 ctx.shader->vs_out_misc_write = 1;
1659 ctx.shader->vs_out_point_size = 1;
1660 break;
1661 case TGSI_SEMANTIC_LAYER:
1662 if (out->spi_sid) {
1663 /* duplicate it as PARAM to pass to the pixel shader */
1664 output.array_base = next_param++;
1665 r600_bytecode_add_output(ctx.bc, &output);
1666 last_exp_param = ctx.bc->cf_last;
1667 }
1668 output.array_base = 61;
1669 if (next_clip_pos == 61)
1670 next_clip_pos = 62;
1671 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1672 output.swizzle_x = 7;
1673 output.swizzle_y = 7;
1674 output.swizzle_z = 0;
1675 output.swizzle_w = 7;
1676 ctx.shader->vs_out_misc_write = 1;
1677 ctx.shader->vs_out_layer = 1;
1678 break;
1679 case TGSI_SEMANTIC_VIEWPORT_INDEX:
1680 if (out->spi_sid) {
1681 /* duplicate it as PARAM to pass to the pixel shader */
1682 output.array_base = next_param++;
1683 r600_bytecode_add_output(ctx.bc, &output);
1684 last_exp_param = ctx.bc->cf_last;
1685 }
1686 output.array_base = 61;
1687 if (next_clip_pos == 61)
1688 next_clip_pos = 62;
1689 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1690 ctx.shader->vs_out_misc_write = 1;
1691 ctx.shader->vs_out_viewport = 1;
1692 output.swizzle_x = 7;
1693 output.swizzle_y = 7;
1694 output.swizzle_z = 7;
1695 output.swizzle_w = 0;
1696 break;
1697 case TGSI_SEMANTIC_CLIPDIST:
1698 /* spi_sid is 0 for clipdistance outputs that were generated
1699 * for clipvertex - we don't need to pass them to PS */
1700 ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
1701 if (out->spi_sid) {
1702 /* duplicate it as PARAM to pass to the pixel shader */
1703 output.array_base = next_param++;
1704 r600_bytecode_add_output(ctx.bc, &output);
1705 last_exp_param = ctx.bc->cf_last;
1706 }
1707 output.array_base = next_clip_pos++;
1708 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1709 break;
1710 case TGSI_SEMANTIC_FOG:
1711 output.swizzle_y = 4; /* 0 */
1712 output.swizzle_z = 4; /* 0 */
1713 output.swizzle_w = 5; /* 1 */
1714 break;
1715 default:
1716 output.array_base = next_param++;
1717 break;
1718 }
1719 r600_bytecode_add_output(ctx.bc, &output);
1720 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
1721 last_exp_param = ctx.bc->cf_last;
1722 else
1723 last_exp_pos = ctx.bc->cf_last;
1724 }
1725
1726 if (!last_exp_pos) {
1727 memset(&output, 0, sizeof(output));
1728 output.gpr = 0;
1729 output.elem_size = 3;
1730 output.swizzle_x = 7;
1731 output.swizzle_y = 7;
1732 output.swizzle_z = 7;
1733 output.swizzle_w = 7;
1734 output.burst_count = 1;
1735 output.type = 2;
1736 output.op = CF_OP_EXPORT;
1737 output.array_base = 60;
1738 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1739 r600_bytecode_add_output(ctx.bc, &output);
1740 last_exp_pos = ctx.bc->cf_last;
1741 }
1742
1743 if (!last_exp_param) {
1744 memset(&output, 0, sizeof(output));
1745 output.gpr = 0;
1746 output.elem_size = 3;
1747 output.swizzle_x = 7;
1748 output.swizzle_y = 7;
1749 output.swizzle_z = 7;
1750 output.swizzle_w = 7;
1751 output.burst_count = 1;
1752 output.type = 2;
1753 output.op = CF_OP_EXPORT;
1754 output.array_base = next_param++;
1755 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1756 r600_bytecode_add_output(ctx.bc, &output);
1757 last_exp_param = ctx.bc->cf_last;
1758 }
1759
1760 last_exp_pos->op = CF_OP_EXPORT_DONE;
1761 last_exp_param->op = CF_OP_EXPORT_DONE;
1762
1763 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
1764 cf_pop = ctx.bc->cf_last;
1765
1766 cf_jump->cf_addr = cf_pop->id + 2;
1767 cf_jump->pop_count = 1;
1768 cf_pop->cf_addr = cf_pop->id + 2;
1769 cf_pop->pop_count = 1;
1770
1771 if (ctx.bc->chip_class == CAYMAN)
1772 cm_bytecode_add_cf_end(ctx.bc);
1773 else {
1774 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
1775 ctx.bc->cf_last->end_of_program = 1;
1776 }
1777
1778 gs->gs_copy_shader = cshader;
1779 cshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
1780
1781 ctx.bc->nstack = 1;
1782
1783 return r600_bytecode_build(ctx.bc);
1784 }
1785
/* Write this shader's outputs to a GS ring via MEM_RING exports.
 *
 * For a VS-as-ES (ctx->gs_for_vs set) each output goes at the ring
 * offset the consuming GS expects for that semantic name+sid; outputs
 * the GS never reads are skipped.  Otherwise (GS writing the GSVS ring)
 * outputs are packed sequentially, 16 bytes apiece, on the ring op for
 * "stream".  With ind==true the exports are indexed by the per-stream
 * export GPR, which is then advanced by one vertex; with ind==false the
 * current vertex's offset is folded directly into the array base.
 * Returns 0 on success or a negative emitter error.
 * NOTE(review): the "so" parameter is unused in this function. */
static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, const struct pipe_stream_output_info *so, int stream, bool ind)
{
	struct r600_bytecode_output output;
	int i, k, ring_offset;
	int effective_stream = stream == -1 ? 0 : stream;
	int idx = 0;

	for (i = 0; i < ctx->shader->noutput; i++) {
		if (ctx->gs_for_vs) {
			/* for ES we need to lookup corresponding ring offset expected by GS
			 * (map this output to GS input by name and sid) */
			/* FIXME precompute offsets */
			ring_offset = -1;
			for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
				struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
				struct r600_shader_io *out = &ctx->shader->output[i];
				if (in->name == out->name && in->sid == out->sid)
					ring_offset = in->ring_offset;
			}

			if (ring_offset == -1)
				continue;
		} else {
			ring_offset = idx * 16;
			idx++;
		}

		/* POSITION is only stored for stream 0 */
		if (stream > 0 && ctx->shader->output[i].name == TGSI_SEMANTIC_POSITION)
			continue;
		/* next_ring_offset after parsing input decls contains total size of
		 * single vertex data, gs_next_vertex - current vertex index */
		if (!ind)
			ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;

		memset(&output, 0, sizeof(struct r600_bytecode_output));
		output.gpr = ctx->shader->output[i].gpr;
		output.elem_size = 3;
		output.comp_mask = 0xF;
		output.burst_count = 1;

		if (ind)
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
		else
			output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;

		switch (stream) {
		default:
		case 0:
			output.op = CF_OP_MEM_RING; break;
		case 1:
			output.op = CF_OP_MEM_RING1; break;
		case 2:
			output.op = CF_OP_MEM_RING2; break;
		case 3:
			output.op = CF_OP_MEM_RING3; break;
		}

		if (ind) {
			output.array_base = ring_offset >> 2; /* in dwords */
			output.array_size = 0xfff;
			output.index_gpr = ctx->gs_export_gpr_tregs[effective_stream];
		} else
			output.array_base = ring_offset >> 2; /* in dwords */
		r600_bytecode_add_output(ctx->bc, &output);
	}

	if (ind) {
		/* get a temp and add the ring offset to the next vertex base in the shader */
		struct r600_bytecode_alu alu;
		int r;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD_INT;
		alu.src[0].sel = ctx->gs_export_gpr_tregs[effective_stream];
		alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
		alu.src[1].value = ctx->gs_out_ring_offset >> 4;
		alu.dst.sel = ctx->gs_export_gpr_tregs[effective_stream];
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	++ctx->gs_next_vertex;
	return 0;
}
1872
1873 static int r600_shader_from_tgsi(struct r600_context *rctx,
1874 struct r600_pipe_shader *pipeshader,
1875 union r600_shader_key key)
1876 {
1877 struct r600_screen *rscreen = rctx->screen;
1878 struct r600_shader *shader = &pipeshader->shader;
1879 struct tgsi_token *tokens = pipeshader->selector->tokens;
1880 struct pipe_stream_output_info so = pipeshader->selector->so;
1881 struct tgsi_full_immediate *immediate;
1882 struct r600_shader_ctx ctx;
1883 struct r600_bytecode_output output[32];
1884 unsigned output_done, noutput;
1885 unsigned opcode;
1886 int i, j, k, r = 0;
1887 int next_param_base = 0, next_clip_base;
1888 int max_color_exports = MAX2(key.ps.nr_cbufs, 1);
1889 /* Declarations used by llvm code */
1890 bool use_llvm = false;
1891 bool indirect_gprs;
1892 bool ring_outputs = false;
1893 bool pos_emitted = false;
1894
1895 #ifdef R600_USE_LLVM
1896 use_llvm = rscreen->b.debug_flags & DBG_LLVM;
1897 #endif
1898 ctx.bc = &shader->bc;
1899 ctx.shader = shader;
1900 ctx.native_integers = true;
1901
1902
1903 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
1904 rscreen->has_compressed_msaa_texturing);
1905 ctx.tokens = tokens;
1906 tgsi_scan_shader(tokens, &ctx.info);
1907 shader->indirect_files = ctx.info.indirect_files;
1908
1909 shader->uses_doubles = ctx.info.uses_doubles;
1910
1911 indirect_gprs = ctx.info.indirect_files & ~((1 << TGSI_FILE_CONSTANT) | (1 << TGSI_FILE_SAMPLER));
1912 tgsi_parse_init(&ctx.parse, tokens);
1913 ctx.type = ctx.info.processor;
1914 shader->processor_type = ctx.type;
1915 ctx.bc->type = shader->processor_type;
1916
1917 switch (ctx.type) {
1918 case TGSI_PROCESSOR_VERTEX:
1919 shader->vs_as_gs_a = key.vs.as_gs_a;
1920 shader->vs_as_es = key.vs.as_es;
1921 if (shader->vs_as_es)
1922 ring_outputs = true;
1923 break;
1924 case TGSI_PROCESSOR_GEOMETRY:
1925 ring_outputs = true;
1926 break;
1927 case TGSI_PROCESSOR_FRAGMENT:
1928 shader->two_side = key.ps.color_two_side;
1929 break;
1930 default:
1931 break;
1932 }
1933
1934 if (shader->vs_as_es) {
1935 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
1936 } else {
1937 ctx.gs_for_vs = NULL;
1938 }
1939
1940 ctx.next_ring_offset = 0;
1941 ctx.gs_out_ring_offset = 0;
1942 ctx.gs_next_vertex = 0;
1943 ctx.gs_stream_output_info = &so;
1944
1945 ctx.face_gpr = -1;
1946 ctx.fixed_pt_position_gpr = -1;
1947 ctx.fragcoord_input = -1;
1948 ctx.colors_used = 0;
1949 ctx.clip_vertex_write = 0;
1950
1951 shader->nr_ps_color_exports = 0;
1952 shader->nr_ps_max_color_exports = 0;
1953
1954
1955 /* register allocations */
1956 /* Values [0,127] correspond to GPR[0..127].
1957 * Values [128,159] correspond to constant buffer bank 0
1958 * Values [160,191] correspond to constant buffer bank 1
1959 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1960 * Values [256,287] correspond to constant buffer bank 2 (EG)
1961 * Values [288,319] correspond to constant buffer bank 3 (EG)
1962 * Other special values are shown in the list below.
1963 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1964 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1965 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1966 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1967 * 248 SQ_ALU_SRC_0: special constant 0.0.
1968 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
1969 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
1970 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1971 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
1972 * 253 SQ_ALU_SRC_LITERAL: literal constant.
1973 * 254 SQ_ALU_SRC_PV: previous vector result.
1974 * 255 SQ_ALU_SRC_PS: previous scalar result.
1975 */
1976 for (i = 0; i < TGSI_FILE_COUNT; i++) {
1977 ctx.file_offset[i] = 0;
1978 }
1979
1980 #ifdef R600_USE_LLVM
1981 if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
1982 fprintf(stderr, "Warning: R600 LLVM backend does not support "
1983 "indirect adressing. Falling back to TGSI "
1984 "backend.\n");
1985 use_llvm = 0;
1986 }
1987 #endif
1988 if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1989 ctx.file_offset[TGSI_FILE_INPUT] = 1;
1990 if (!use_llvm) {
1991 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
1992 }
1993 }
1994 if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
1995 if (ctx.bc->chip_class >= EVERGREEN)
1996 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
1997 else
1998 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
1999 }
2000 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2001 /* FIXME 1 would be enough in some cases (3 or less input vertices) */
2002 ctx.file_offset[TGSI_FILE_INPUT] = 2;
2003 }
2004 ctx.use_llvm = use_llvm;
2005
2006 if (use_llvm) {
2007 ctx.file_offset[TGSI_FILE_OUTPUT] =
2008 ctx.file_offset[TGSI_FILE_INPUT];
2009 } else {
2010 ctx.file_offset[TGSI_FILE_OUTPUT] =
2011 ctx.file_offset[TGSI_FILE_INPUT] +
2012 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
2013 }
2014 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
2015 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
2016
2017 /* Outside the GPR range. This will be translated to one of the
2018 * kcache banks later. */
2019 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
2020
2021 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
2022 ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
2023 ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
2024 ctx.bc->index_reg[0] = ctx.bc->ar_reg + 1;
2025 ctx.bc->index_reg[1] = ctx.bc->ar_reg + 2;
2026
2027 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2028 ctx.gs_export_gpr_tregs[0] = ctx.bc->ar_reg + 3;
2029 ctx.gs_export_gpr_tregs[1] = ctx.bc->ar_reg + 4;
2030 ctx.gs_export_gpr_tregs[2] = ctx.bc->ar_reg + 5;
2031 ctx.gs_export_gpr_tregs[3] = ctx.bc->ar_reg + 6;
2032 ctx.temp_reg = ctx.bc->ar_reg + 7;
2033 } else {
2034 ctx.temp_reg = ctx.bc->ar_reg + 3;
2035 }
2036
2037 shader->max_arrays = 0;
2038 shader->num_arrays = 0;
2039 if (indirect_gprs) {
2040
2041 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
2042 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
2043 ctx.file_offset[TGSI_FILE_OUTPUT] -
2044 ctx.file_offset[TGSI_FILE_INPUT],
2045 0x0F);
2046 }
2047 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
2048 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
2049 ctx.file_offset[TGSI_FILE_TEMPORARY] -
2050 ctx.file_offset[TGSI_FILE_OUTPUT],
2051 0x0F);
2052 }
2053 }
2054
2055 ctx.nliterals = 0;
2056 ctx.literals = NULL;
2057
2058 shader->fs_write_all = ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS];
2059 shader->vs_position_window_space = ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION];
2060 shader->ps_conservative_z = (uint8_t)ctx.info.properties[TGSI_PROPERTY_FS_DEPTH_LAYOUT];
2061
2062 if (shader->vs_as_gs_a)
2063 vs_add_primid_output(&ctx, key.vs.prim_id_out);
2064
2065 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
2066 tgsi_parse_token(&ctx.parse);
2067 switch (ctx.parse.FullToken.Token.Type) {
2068 case TGSI_TOKEN_TYPE_IMMEDIATE:
2069 immediate = &ctx.parse.FullToken.FullImmediate;
2070 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
2071 if(ctx.literals == NULL) {
2072 r = -ENOMEM;
2073 goto out_err;
2074 }
2075 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
2076 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
2077 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
2078 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
2079 ctx.nliterals++;
2080 break;
2081 case TGSI_TOKEN_TYPE_DECLARATION:
2082 r = tgsi_declaration(&ctx);
2083 if (r)
2084 goto out_err;
2085 break;
2086 case TGSI_TOKEN_TYPE_INSTRUCTION:
2087 case TGSI_TOKEN_TYPE_PROPERTY:
2088 break;
2089 default:
2090 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
2091 r = -EINVAL;
2092 goto out_err;
2093 }
2094 }
2095
2096 shader->ring_item_sizes[0] = ctx.next_ring_offset;
2097 shader->ring_item_sizes[1] = 0;
2098 shader->ring_item_sizes[2] = 0;
2099 shader->ring_item_sizes[3] = 0;
2100
2101 /* Process two side if needed */
2102 if (shader->two_side && ctx.colors_used) {
2103 int i, count = ctx.shader->ninput;
2104 unsigned next_lds_loc = ctx.shader->nlds;
2105
2106 /* additional inputs will be allocated right after the existing inputs,
2107 * we won't need them after the color selection, so we don't need to
2108 * reserve these gprs for the rest of the shader code and to adjust
2109 * output offsets etc. */
2110 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
2111 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
2112
2113 /* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
2114 if (ctx.face_gpr == -1) {
2115 i = ctx.shader->ninput++;
2116 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
2117 ctx.shader->input[i].spi_sid = 0;
2118 ctx.shader->input[i].gpr = gpr++;
2119 ctx.face_gpr = ctx.shader->input[i].gpr;
2120 }
2121
2122 for (i = 0; i < count; i++) {
2123 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2124 int ni = ctx.shader->ninput++;
2125 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
2126 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
2127 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
2128 ctx.shader->input[ni].gpr = gpr++;
2129 // TGSI to LLVM needs to know the lds position of inputs.
2130 // Non LLVM path computes it later (in process_twoside_color)
2131 ctx.shader->input[ni].lds_pos = next_lds_loc++;
2132 ctx.shader->input[i].back_color_input = ni;
2133 if (ctx.bc->chip_class >= EVERGREEN) {
2134 if ((r = evergreen_interp_input(&ctx, ni)))
2135 return r;
2136 }
2137 }
2138 }
2139 }
2140
2141 /* LLVM backend setup */
2142 #ifdef R600_USE_LLVM
2143 if (use_llvm) {
2144 struct radeon_llvm_context radeon_llvm_ctx;
2145 LLVMModuleRef mod;
2146 bool dump = r600_can_dump_shader(&rscreen->b, tokens);
2147 boolean use_kill = false;
2148
2149 memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
2150 radeon_llvm_ctx.type = ctx.type;
2151 radeon_llvm_ctx.two_side = shader->two_side;
2152 radeon_llvm_ctx.face_gpr = ctx.face_gpr;
2153 radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1;
2154 radeon_llvm_ctx.r600_inputs = ctx.shader->input;
2155 radeon_llvm_ctx.r600_outputs = ctx.shader->output;
2156 radeon_llvm_ctx.color_buffer_count = max_color_exports;
2157 radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
2158 radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
2159 radeon_llvm_ctx.stream_outputs = &so;
2160 radeon_llvm_ctx.alpha_to_one = key.ps.alpha_to_one;
2161 radeon_llvm_ctx.has_compressed_msaa_texturing =
2162 ctx.bc->has_compressed_msaa_texturing;
2163 mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
2164 ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
2165 ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
2166
2167 if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
2168 radeon_llvm_dispose(&radeon_llvm_ctx);
2169 use_llvm = 0;
2170 fprintf(stderr, "R600 LLVM backend failed to compile "
2171 "shader. Falling back to TGSI\n");
2172 } else {
2173 ctx.file_offset[TGSI_FILE_OUTPUT] =
2174 ctx.file_offset[TGSI_FILE_INPUT];
2175 }
2176 if (use_kill)
2177 ctx.shader->uses_kill = use_kill;
2178 radeon_llvm_dispose(&radeon_llvm_ctx);
2179 }
2180 #endif
2181 /* End of LLVM backend setup */
2182
2183 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
2184 shader->nr_ps_max_color_exports = 8;
2185
2186 if (!use_llvm) {
2187 if (ctx.fragcoord_input >= 0) {
2188 if (ctx.bc->chip_class == CAYMAN) {
2189 for (j = 0 ; j < 4; j++) {
2190 struct r600_bytecode_alu alu;
2191 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2192 alu.op = ALU_OP1_RECIP_IEEE;
2193 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
2194 alu.src[0].chan = 3;
2195
2196 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
2197 alu.dst.chan = j;
2198 alu.dst.write = (j == 3);
2199 alu.last = 1;
2200 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
2201 return r;
2202 }
2203 } else {
2204 struct r600_bytecode_alu alu;
2205 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2206 alu.op = ALU_OP1_RECIP_IEEE;
2207 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
2208 alu.src[0].chan = 3;
2209
2210 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
2211 alu.dst.chan = 3;
2212 alu.dst.write = 1;
2213 alu.last = 1;
2214 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
2215 return r;
2216 }
2217 }
2218
2219 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2220 struct r600_bytecode_alu alu;
2221 int r;
2222
2223 /* GS thread with no output workaround - emit a cut at start of GS */
2224 if (ctx.bc->chip_class == R600)
2225 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CUT_VERTEX);
2226
2227 for (j = 0; j < 4; j++) {
2228 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2229 alu.op = ALU_OP1_MOV;
2230 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
2231 alu.src[0].value = 0;
2232 alu.dst.sel = ctx.gs_export_gpr_tregs[j];
2233 alu.dst.write = 1;
2234 alu.last = 1;
2235 r = r600_bytecode_add_alu(ctx.bc, &alu);
2236 if (r)
2237 return r;
2238 }
2239 }
2240 if (shader->two_side && ctx.colors_used) {
2241 if ((r = process_twoside_color_inputs(&ctx)))
2242 return r;
2243 }
2244
2245 tgsi_parse_init(&ctx.parse, tokens);
2246 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
2247 tgsi_parse_token(&ctx.parse);
2248 switch (ctx.parse.FullToken.Token.Type) {
2249 case TGSI_TOKEN_TYPE_INSTRUCTION:
2250 r = tgsi_is_supported(&ctx);
2251 if (r)
2252 goto out_err;
2253 ctx.max_driver_temp_used = 0;
2254 /* reserve first tmp for everyone */
2255 r600_get_temp(&ctx);
2256
2257 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
2258 if ((r = tgsi_split_constant(&ctx)))
2259 goto out_err;
2260 if ((r = tgsi_split_literal_constant(&ctx)))
2261 goto out_err;
2262 if (ctx.type == TGSI_PROCESSOR_GEOMETRY)
2263 if ((r = tgsi_split_gs_inputs(&ctx)))
2264 goto out_err;
2265 if (ctx.bc->chip_class == CAYMAN)
2266 ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
2267 else if (ctx.bc->chip_class >= EVERGREEN)
2268 ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
2269 else
2270 ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
2271 r = ctx.inst_info->process(&ctx);
2272 if (r)
2273 goto out_err;
2274 break;
2275 default:
2276 break;
2277 }
2278 }
2279 }
2280
2281 /* Reset the temporary register counter. */
2282 ctx.max_driver_temp_used = 0;
2283
2284 noutput = shader->noutput;
2285
2286 if (!ring_outputs && ctx.clip_vertex_write) {
2287 unsigned clipdist_temp[2];
2288
2289 clipdist_temp[0] = r600_get_temp(&ctx);
2290 clipdist_temp[1] = r600_get_temp(&ctx);
2291
2292 /* need to convert a clipvertex write into clipdistance writes and not export
2293 the clip vertex anymore */
2294
2295 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
2296 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
2297 shader->output[noutput].gpr = clipdist_temp[0];
2298 noutput++;
2299 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
2300 shader->output[noutput].gpr = clipdist_temp[1];
2301 noutput++;
2302
2303 /* reset spi_sid for clipvertex output to avoid confusing spi */
2304 shader->output[ctx.cv_output].spi_sid = 0;
2305
2306 shader->clip_dist_write = 0xFF;
2307
2308 for (i = 0; i < 8; i++) {
2309 int oreg = i >> 2;
2310 int ochan = i & 3;
2311
2312 for (j = 0; j < 4; j++) {
2313 struct r600_bytecode_alu alu;
2314 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2315 alu.op = ALU_OP2_DOT4;
2316 alu.src[0].sel = shader->output[ctx.cv_output].gpr;
2317 alu.src[0].chan = j;
2318
2319 alu.src[1].sel = 512 + i;
2320 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
2321 alu.src[1].chan = j;
2322
2323 alu.dst.sel = clipdist_temp[oreg];
2324 alu.dst.chan = j;
2325 alu.dst.write = (j == ochan);
2326 if (j == 3)
2327 alu.last = 1;
2328 if (!use_llvm)
2329 r = r600_bytecode_add_alu(ctx.bc, &alu);
2330 if (r)
2331 return r;
2332 }
2333 }
2334 }
2335
2336 /* Add stream outputs. */
2337 if (!ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX &&
2338 so.num_outputs && !use_llvm)
2339 emit_streamout(&ctx, &so, -1, NULL);
2340
2341 pipeshader->enabled_stream_buffers_mask = ctx.enabled_stream_buffers_mask;
2342 convert_edgeflag_to_int(&ctx);
2343
2344 if (ring_outputs) {
2345 if (shader->vs_as_es) {
2346 ctx.gs_export_gpr_tregs[0] = r600_get_temp(&ctx);
2347 ctx.gs_export_gpr_tregs[1] = -1;
2348 ctx.gs_export_gpr_tregs[2] = -1;
2349 ctx.gs_export_gpr_tregs[3] = -1;
2350
2351 emit_gs_ring_writes(&ctx, &so, -1, FALSE);
2352 }
2353 } else {
2354 /* Export output */
2355 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
2356
2357 for (i = 0, j = 0; i < noutput; i++, j++) {
2358 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2359 output[j].gpr = shader->output[i].gpr;
2360 output[j].elem_size = 3;
2361 output[j].swizzle_x = 0;
2362 output[j].swizzle_y = 1;
2363 output[j].swizzle_z = 2;
2364 output[j].swizzle_w = 3;
2365 output[j].burst_count = 1;
2366 output[j].type = -1;
2367 output[j].op = CF_OP_EXPORT;
2368 switch (ctx.type) {
2369 case TGSI_PROCESSOR_VERTEX:
2370 switch (shader->output[i].name) {
2371 case TGSI_SEMANTIC_POSITION:
2372 output[j].array_base = 60;
2373 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2374 pos_emitted = true;
2375 break;
2376
2377 case TGSI_SEMANTIC_PSIZE:
2378 output[j].array_base = 61;
2379 output[j].swizzle_y = 7;
2380 output[j].swizzle_z = 7;
2381 output[j].swizzle_w = 7;
2382 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2383 pos_emitted = true;
2384 break;
2385 case TGSI_SEMANTIC_EDGEFLAG:
2386 output[j].array_base = 61;
2387 output[j].swizzle_x = 7;
2388 output[j].swizzle_y = 0;
2389 output[j].swizzle_z = 7;
2390 output[j].swizzle_w = 7;
2391 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2392 pos_emitted = true;
2393 break;
2394 case TGSI_SEMANTIC_LAYER:
2395 /* spi_sid is 0 for outputs that are
2396 * not consumed by PS */
2397 if (shader->output[i].spi_sid) {
2398 output[j].array_base = next_param_base++;
2399 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2400 j++;
2401 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2402 }
2403 output[j].array_base = 61;
2404 output[j].swizzle_x = 7;
2405 output[j].swizzle_y = 7;
2406 output[j].swizzle_z = 0;
2407 output[j].swizzle_w = 7;
2408 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2409 pos_emitted = true;
2410 break;
2411 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2412 /* spi_sid is 0 for outputs that are
2413 * not consumed by PS */
2414 if (shader->output[i].spi_sid) {
2415 output[j].array_base = next_param_base++;
2416 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2417 j++;
2418 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2419 }
2420 output[j].array_base = 61;
2421 output[j].swizzle_x = 7;
2422 output[j].swizzle_y = 7;
2423 output[j].swizzle_z = 7;
2424 output[j].swizzle_w = 0;
2425 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2426 pos_emitted = true;
2427 break;
2428 case TGSI_SEMANTIC_CLIPVERTEX:
2429 j--;
2430 break;
2431 case TGSI_SEMANTIC_CLIPDIST:
2432 output[j].array_base = next_clip_base++;
2433 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2434 pos_emitted = true;
2435 /* spi_sid is 0 for clipdistance outputs that were generated
2436 * for clipvertex - we don't need to pass them to PS */
2437 if (shader->output[i].spi_sid) {
2438 j++;
2439 /* duplicate it as PARAM to pass to the pixel shader */
2440 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2441 output[j].array_base = next_param_base++;
2442 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2443 }
2444 break;
2445 case TGSI_SEMANTIC_FOG:
2446 output[j].swizzle_y = 4; /* 0 */
2447 output[j].swizzle_z = 4; /* 0 */
2448 output[j].swizzle_w = 5; /* 1 */
2449 break;
2450 case TGSI_SEMANTIC_PRIMID:
2451 output[j].swizzle_x = 2;
2452 output[j].swizzle_y = 4; /* 0 */
2453 output[j].swizzle_z = 4; /* 0 */
2454 output[j].swizzle_w = 4; /* 0 */
2455 break;
2456 }
2457
2458 break;
2459 case TGSI_PROCESSOR_FRAGMENT:
2460 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
2461 /* never export more colors than the number of CBs */
2462 if (shader->output[i].sid >= max_color_exports) {
2463 /* skip export */
2464 j--;
2465 continue;
2466 }
2467 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
2468 output[j].array_base = shader->output[i].sid;
2469 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2470 shader->nr_ps_color_exports++;
2471 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
2472 for (k = 1; k < max_color_exports; k++) {
2473 j++;
2474 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2475 output[j].gpr = shader->output[i].gpr;
2476 output[j].elem_size = 3;
2477 output[j].swizzle_x = 0;
2478 output[j].swizzle_y = 1;
2479 output[j].swizzle_z = 2;
2480 output[j].swizzle_w = key.ps.alpha_to_one ? 5 : 3;
2481 output[j].burst_count = 1;
2482 output[j].array_base = k;
2483 output[j].op = CF_OP_EXPORT;
2484 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2485 shader->nr_ps_color_exports++;
2486 }
2487 }
2488 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
2489 output[j].array_base = 61;
2490 output[j].swizzle_x = 2;
2491 output[j].swizzle_y = 7;
2492 output[j].swizzle_z = output[j].swizzle_w = 7;
2493 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2494 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
2495 output[j].array_base = 61;
2496 output[j].swizzle_x = 7;
2497 output[j].swizzle_y = 1;
2498 output[j].swizzle_z = output[j].swizzle_w = 7;
2499 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2500 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
2501 output[j].array_base = 61;
2502 output[j].swizzle_x = 7;
2503 output[j].swizzle_y = 7;
2504 output[j].swizzle_z = 0;
2505 output[j].swizzle_w = 7;
2506 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2507 } else {
2508 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
2509 r = -EINVAL;
2510 goto out_err;
2511 }
2512 break;
2513 default:
2514 R600_ERR("unsupported processor type %d\n", ctx.type);
2515 r = -EINVAL;
2516 goto out_err;
2517 }
2518
2519 if (output[j].type==-1) {
2520 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2521 output[j].array_base = next_param_base++;
2522 }
2523 }
2524
2525 /* add fake position export */
2526 if (ctx.type == TGSI_PROCESSOR_VERTEX && pos_emitted == false) {
2527 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2528 output[j].gpr = 0;
2529 output[j].elem_size = 3;
2530 output[j].swizzle_x = 7;
2531 output[j].swizzle_y = 7;
2532 output[j].swizzle_z = 7;
2533 output[j].swizzle_w = 7;
2534 output[j].burst_count = 1;
2535 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2536 output[j].array_base = 60;
2537 output[j].op = CF_OP_EXPORT;
2538 j++;
2539 }
2540
2541 /* add fake param output for vertex shader if no param is exported */
2542 if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
2543 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2544 output[j].gpr = 0;
2545 output[j].elem_size = 3;
2546 output[j].swizzle_x = 7;
2547 output[j].swizzle_y = 7;
2548 output[j].swizzle_z = 7;
2549 output[j].swizzle_w = 7;
2550 output[j].burst_count = 1;
2551 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2552 output[j].array_base = 0;
2553 output[j].op = CF_OP_EXPORT;
2554 j++;
2555 }
2556
2557 /* add fake pixel export */
2558 if (ctx.type == TGSI_PROCESSOR_FRAGMENT && shader->nr_ps_color_exports == 0) {
2559 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2560 output[j].gpr = 0;
2561 output[j].elem_size = 3;
2562 output[j].swizzle_x = 7;
2563 output[j].swizzle_y = 7;
2564 output[j].swizzle_z = 7;
2565 output[j].swizzle_w = 7;
2566 output[j].burst_count = 1;
2567 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2568 output[j].array_base = 0;
2569 output[j].op = CF_OP_EXPORT;
2570 j++;
2571 shader->nr_ps_color_exports++;
2572 }
2573
2574 noutput = j;
2575
2576 /* set export done on last export of each type */
2577 for (i = noutput - 1, output_done = 0; i >= 0; i--) {
2578 if (!(output_done & (1 << output[i].type))) {
2579 output_done |= (1 << output[i].type);
2580 output[i].op = CF_OP_EXPORT_DONE;
2581 }
2582 }
2583 /* add output to bytecode */
2584 if (!use_llvm) {
2585 for (i = 0; i < noutput; i++) {
2586 r = r600_bytecode_add_output(ctx.bc, &output[i]);
2587 if (r)
2588 goto out_err;
2589 }
2590 }
2591 }
2592
2593 /* add program end */
2594 if (!use_llvm) {
2595 if (ctx.bc->chip_class == CAYMAN)
2596 cm_bytecode_add_cf_end(ctx.bc);
2597 else {
2598 const struct cf_op_info *last = NULL;
2599
2600 if (ctx.bc->cf_last)
2601 last = r600_isa_cf(ctx.bc->cf_last->op);
2602
2603 /* alu clause instructions don't have EOP bit, so add NOP */
2604 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS)
2605 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2606
2607 ctx.bc->cf_last->end_of_program = 1;
2608 }
2609 }
2610
2611 /* check GPR limit - we have 124 = 128 - 4
2612 * (4 are reserved as alu clause temporary registers) */
2613 if (ctx.bc->ngpr > 124) {
2614 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
2615 r = -ENOMEM;
2616 goto out_err;
2617 }
2618
2619 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2620 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
2621 return r;
2622 }
2623
2624 free(ctx.literals);
2625 tgsi_parse_free(&ctx.parse);
2626 return 0;
2627 out_err:
2628 free(ctx.literals);
2629 tgsi_parse_free(&ctx.parse);
2630 return r;
2631 }
2632
2633 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
2634 {
2635 const unsigned tgsi_opcode =
2636 ctx->parse.FullToken.FullInstruction.Instruction.Opcode;
2637 R600_ERR("%s tgsi opcode unsupported\n",
2638 tgsi_get_opcode_name(tgsi_opcode));
2639 return -EINVAL;
2640 }
2641
/* Handler for TGSI END: intentionally emits nothing here — the
 * end-of-program marker is appended after the token walk finishes. */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	return 0;
}
2646
2647 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
2648 const struct r600_shader_src *shader_src,
2649 unsigned chan)
2650 {
2651 bc_src->sel = shader_src->sel;
2652 bc_src->chan = shader_src->swizzle[chan];
2653 bc_src->neg = shader_src->neg;
2654 bc_src->abs = shader_src->abs;
2655 bc_src->rel = shader_src->rel;
2656 bc_src->value = shader_src->value[bc_src->chan];
2657 bc_src->kc_bank = shader_src->kc_bank;
2658 bc_src->kc_rel = shader_src->kc_rel;
2659 }
2660
/* Force the absolute-value modifier on an ALU source operand.
 * The negate flag is cleared at the same time so the operand reads as
 * |src| rather than -|src|. */
static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
{
	bc_src->abs = 1;
	bc_src->neg = 0;
}
2666
2667 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
2668 {
2669 bc_src->neg = !bc_src->neg;
2670 }
2671
2672 static void tgsi_dst(struct r600_shader_ctx *ctx,
2673 const struct tgsi_full_dst_register *tgsi_dst,
2674 unsigned swizzle,
2675 struct r600_bytecode_alu_dst *r600_dst)
2676 {
2677 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2678
2679 r600_dst->sel = tgsi_dst->Register.Index;
2680 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
2681 r600_dst->chan = swizzle;
2682 r600_dst->write = 1;
2683 if (tgsi_dst->Register.Indirect)
2684 r600_dst->rel = V_SQ_REL_RELATIVE;
2685 if (inst->Instruction.Saturate) {
2686 r600_dst->clamp = 1;
2687 }
2688 }
2689
/* Return the index of the highest channel enabled in a 4-bit writemask,
 * i.e. the channel of the last ALU instruction emitted for that mask.
 * An empty mask yields 0. */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	for (chan = 3; chan >= 0; chan--) {
		if (writemask & (1 << chan))
			return chan;
	}
	return 0;
}
2701
2702
2703
/* Emit a two-source 64-bit (double) ALU op. Doubles occupy channel pairs
 * (xy and/or zw), so the writemask is widened to full pairs before the
 * per-channel loop.
 *
 * singledest: the TGSI dst has one logical (scalar double) component; a
 *             temp register is used when the result pair would land on
 *             the "high" half (mask 0x2 or 0x8) and then moved into place.
 * swap:       emit src1 op src0 instead of src0 op src1.
 * Returns 0 on success or the r600_bytecode_add_alu error code. */
static int tgsi_op2_64_params(struct r600_shader_ctx *ctx, bool singledest, bool swap)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	struct r600_bytecode_alu alu;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	int use_tmp = 0;

	if (singledest) {
		/* widen the scalar mask to its channel pair; use_tmp records
		 * (temp channel + 1) when a staging move is needed */
		switch (write_mask) {
		case 0x1:
			write_mask = 0x3;
			break;
		case 0x2:
			use_tmp = 1;
			write_mask = 0x3;
			break;
		case 0x4:
			write_mask = 0xc;
			break;
		case 0x8:
			write_mask = 0xc;
			use_tmp = 3;
			break;
		}
	}

	/* recompute: the mask may have been widened above */
	lasti = tgsi_last_instruction(write_mask);
	for (i = 0; i <= lasti; i++) {

		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		if (singledest) {
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			if (use_tmp) {
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = 1;
			}
			/* only the low half of each pair is committed for a
			 * single-dest op */
			if (i == 1 || i == 3)
				alu.dst.write = 0;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DABS) {
			/* DABS reads the source unswapped; abs is applied below */
			r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		} else if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				/* fp64_switch pairs low/high words for 64-bit ops */
				r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], fp64_switch(i));
			r600_bytecode_src(&alu.src[1], &ctx->src[0], fp64_switch(i));
		}

		/* handle some special cases: modifiers apply on the high word
		 * (channels 1 and 3), where the double's sign bit lives */
		if (i == 1 || i == 3) {
			switch (ctx->parse.FullToken.FullInstruction.Instruction.Opcode) {
			case TGSI_OPCODE_SUB:
				r600_bytecode_src_toggle_neg(&alu.src[1]);
				break;
			case TGSI_OPCODE_DABS:
				r600_bytecode_src_set_abs(&alu.src[0]);
				break;
			default:
				break;
			}
		}
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* restore the caller's (unwidened) mask for the final moves */
		write_mask = inst->Dst[0].Register.WriteMask;

		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = use_tmp - 1;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2806
2807 static int tgsi_op2_64(struct r600_shader_ctx *ctx)
2808 {
2809 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2810 unsigned write_mask = inst->Dst[0].Register.WriteMask;
2811 /* confirm writemasking */
2812 if ((write_mask & 0x3) != 0x3 &&
2813 (write_mask & 0xc) != 0xc) {
2814 fprintf(stderr, "illegal writemask for 64-bit: 0x%x\n", write_mask);
2815 return -1;
2816 }
2817 return tgsi_op2_64_params(ctx, false, false);
2818 }
2819
/* 64-bit op producing a single scalar double result, operands in order. */
static int tgsi_op2_64_single_dest(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, false);
}
2824
/* Like tgsi_op2_64_single_dest but with src0/src1 swapped, for hardware
 * ops that only take the operands in the reverse order. */
static int tgsi_op2_64_single_dest_s(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_64_params(ctx, true, true);
}
2829
/* Emit a three-source 64-bit op across all four channels. Sources are fed
 * the low word (swizzle 1) for channels 0-2 and the high word (swizzle 0)
 * for channel 3; channels masked out of the TGSI dst are redirected into a
 * scratch temp so the full pair is still computed. */
static int tgsi_op3_64(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, r;
	int lasti = 3;
	int tmp = r600_get_temp(ctx);

	for (i = 0; i < lasti + 1; i++) {

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], i == 3 ? 0 : 1);
		}

		if (inst->Dst[0].Register.WriteMask & (1 << i))
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		else
			alu.dst.sel = tmp;

		/* dst.chan is set after tgsi_dst on purpose: it must be the loop
		 * channel even when the result goes to the scratch temp */
		alu.dst.chan = i;
		alu.is_op3 = 1;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
2862
/* Generic two-source 32-bit ALU op translator.
 *
 * swap:       emit src1 op src0 instead of src0 op src1.
 * trans_only: the op runs only in the scalar (trans) slot, so each channel
 *             must terminate its own ALU group (alu.last on every emit);
 *             with more than one written component the results are staged
 *             in a temp and moved to the real dst afterwards, so a dst
 *             channel is never overwritten while later channels still
 *             read it as a source.
 * Returns 0 on success or the r600_bytecode_add_alu error code. */
static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r, lasti = tgsi_last_instruction(write_mask);
	/* use temp register if trans_only and more than one dst component */
	int use_tmp = trans_only && (write_mask ^ (1 << lasti));

	for (i = 0; i <= lasti; i++) {
		if (!(write_mask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		if (use_tmp) {
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.dst.write = 1;
		} else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ctx->inst_info->op;
		if (!swap) {
			for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}
		} else {
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		}
		/* handle some special cases: SUB becomes ADD with negated src1,
		 * ABS becomes MOV with the abs modifier */
		switch (inst->Instruction.Opcode) {
		case TGSI_OPCODE_SUB:
			r600_bytecode_src_toggle_neg(&alu.src[1]);
			break;
		case TGSI_OPCODE_ABS:
			r600_bytecode_src_set_abs(&alu.src[0]);
			break;
		default:
			break;
		}
		if (i == lasti || trans_only) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	if (use_tmp) {
		/* move result from temp to dst */
		for (i = 0; i <= lasti; i++) {
			if (!(write_mask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = i;
			alu.last = (i == lasti);

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}
	return 0;
}
2932
/* Plain two-source op: operand order preserved, vector slots allowed. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 0);
}
2937
/* Two-source op with src0/src1 swapped before emission. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 1, 0);
}
2942
/* Two-source op restricted to the scalar (trans) slot. */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	return tgsi_op2_s(ctx, 0, 1);
}
2947
2948 static int tgsi_ineg(struct r600_shader_ctx *ctx)
2949 {
2950 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2951 struct r600_bytecode_alu alu;
2952 int i, r;
2953 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2954
2955 for (i = 0; i < lasti + 1; i++) {
2956
2957 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2958 continue;
2959 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2960 alu.op = ctx->inst_info->op;
2961
2962 alu.src[0].sel = V_SQ_ALU_SRC_0;
2963
2964 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2965
2966 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2967
2968 if (i == lasti) {
2969 alu.last = 1;
2970 }
2971 r = r600_bytecode_add_alu(ctx->bc, &alu);
2972 if (r)
2973 return r;
2974 }
2975 return 0;
2976
2977 }
2978
2979 static int tgsi_dneg(struct r600_shader_ctx *ctx)
2980 {
2981 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2982 struct r600_bytecode_alu alu;
2983 int i, r;
2984 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2985
2986 for (i = 0; i < lasti + 1; i++) {
2987
2988 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2989 continue;
2990 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2991 alu.op = ALU_OP1_MOV;
2992
2993 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
2994
2995 if (i == 1 || i == 3)
2996 r600_bytecode_src_toggle_neg(&alu.src[0]);
2997 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2998
2999 if (i == lasti) {
3000 alu.last = 1;
3001 }
3002 r = r600_bytecode_add_alu(ctx->bc, &alu);
3003 if (r)
3004 return r;
3005 }
3006 return 0;
3007
3008 }
3009
/* DFRACEXP: split a double into fraction (dst0, a double) and exponent
 * (dst1, an int). The op is run on all four channels into a temp, then the
 * results are moved into the two TGSI destinations.
 * NOTE(review): the move pattern assumes the hardware places the fraction
 * pair in temp channels 2-3 and the exponent in temp channel 1 — TODO
 * confirm against the R600 ISA docs; not derivable from this file alone. */
static int tgsi_dfracexp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	unsigned write_mask = inst->Dst[0].Register.WriteMask;
	int i, j, r;
	/* dst0 pair starts at channel 2 when only zw is written, else at 0 */
	int firsti = write_mask == 0xc ? 2 : 0;

	for (i = 0; i <= 3; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
			r600_bytecode_src(&alu.src[j], &ctx->src[j], fp64_switch(i));
		}

		if (i == 3)
			alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* MOV first two channels to writemask dst0 */
	for (i = 0; i <= 1; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		/* fraction result is read from temp channels 2 and 3 */
		alu.src[0].chan = i + 2;
		alu.src[0].sel = ctx->temp_reg;

		tgsi_dst(ctx, &inst->Dst[0], firsti + i, &alu.dst);
		alu.dst.write = (inst->Dst[0].Register.WriteMask >> (firsti + i)) & 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	for (i = 0; i <= 3; i++) {
		if (inst->Dst[1].Register.WriteMask & (1 << i)) {
			/* MOV third channels to writemask dst1 */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_MOV;
			/* exponent result is read from temp channel 1 */
			alu.src[0].chan = 1;
			alu.src[0].sel = ctx->temp_reg;

			tgsi_dst(ctx, &inst->Dst[1], i, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
			/* only the first enabled channel of dst1 is written */
			break;
		}
	}
	return 0;
}
3070
3071
/* I2D / U2D: convert 32-bit (un)signed integers to doubles.
 * Pass 1 converts each integer to a 32-bit float in the temp register;
 * pass 2 widens float->double into the destination channel pairs — the
 * even channel of each pair reads the converted float, the odd channel
 * feeds a zero literal into FLT32_TO_FLT64.
 */
static int egcm_int_to_double(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	assert(inst->Instruction.Opcode == TGSI_OPCODE_I2D ||
	       inst->Instruction.Opcode == TGSI_OPCODE_U2D);

	/* int -> float, one conversion per destination double */
	for (i = 0; i <= (lasti+1)/2; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* float -> double into each written destination channel */
	for (i = 0; i <= lasti; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_FLT32_TO_FLT64;

		alu.src[0].chan = i/2;
		if (i%2 == 0)
			alu.src[0].sel = ctx->temp_reg;
		else {
			/* odd half of the pair takes a zero literal */
			alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
			alu.src[0].value = 0x0;
		}
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.last = i == lasti;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3118
3119 static int egcm_double_to_int(struct r600_shader_ctx *ctx)
3120 {
3121 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3122 struct r600_bytecode_alu alu;
3123 int i, r;
3124 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3125
3126 assert(inst->Instruction.Opcode == TGSI_OPCODE_D2I ||
3127 inst->Instruction.Opcode == TGSI_OPCODE_D2U);
3128
3129 for (i = 0; i <= lasti; i++) {
3130 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3131 alu.op = ALU_OP1_FLT64_TO_FLT32;
3132
3133 r600_bytecode_src(&alu.src[0], &ctx->src[0], fp64_switch(i));
3134 alu.dst.chan = i;
3135 alu.dst.sel = ctx->temp_reg;
3136 alu.dst.write = i%2 == 0;
3137 alu.last = i == lasti;
3138
3139 r = r600_bytecode_add_alu(ctx->bc, &alu);
3140 if (r)
3141 return r;
3142 }
3143
3144 for (i = 0; i <= (lasti+1)/2; i++) {
3145 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3146 alu.op = ctx->inst_info->op;
3147
3148 alu.src[0].chan = i*2;
3149 alu.src[0].sel = ctx->temp_reg;
3150 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
3151 alu.last = 1;
3152
3153 r = r600_bytecode_add_alu(ctx->bc, &alu);
3154 if (r)
3155 return r;
3156 }
3157
3158 return 0;
3159 }
3160
/* Double-precision transcendental (DRSQ / DSQRT / DRCP ...): the op is
 * issued in three slots; the double result occupies channels x/y of the
 * temp and is then fanned out so every written destination pair gets
 * that x/y pair.
 */
static int cayman_emit_double_instr(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	int i, r;
	struct r600_bytecode_alu alu;
	int last_slot = 3;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	int t1 = ctx->temp_reg;

	/* these have to write the result to X/Y by the looks of it */
	for (i = 0 ; i < last_slot; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ctx->inst_info->op;

		/* should only be one src regs */
		assert (inst->Instruction.NumSrcRegs == 1);

		/* the two halves of the double go in as separate sources,
		 * channel 1 first, then channel 0 */
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 0);

		/* RSQ should take the absolute value of src */
		if (ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DRSQ ||
		    ctx->parse.FullToken.FullInstruction.Instruction.Opcode == TGSI_OPCODE_DSQRT) {
			r600_bytecode_src_set_abs(&alu.src[1]);
		}
		alu.dst.sel = t1;
		alu.dst.chan = i;
		/* only the first two slots hold the double result */
		alu.dst.write = (i == 0 || i == 1);

		/* non-Cayman issues these as scalar ops, so each one closes
		 * its instruction group */
		if (ctx->bc->chip_class != CAYMAN || i == last_slot - 1)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* replicate temp.xy into each written destination channel pair */
	for (i = 0 ; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = t1;
		/* even dest channels get temp.x, odd ones temp.y */
		alu.src[0].chan = (i == 0 || i == 2) ? 0 : 1;
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		if (i == lasti)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
3214
3215 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
3216 {
3217 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3218 int i, j, r;
3219 struct r600_bytecode_alu alu;
3220 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
3221
3222 for (i = 0 ; i < last_slot; i++) {
3223 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3224 alu.op = ctx->inst_info->op;
3225 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3226 r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
3227
3228 /* RSQ should take the absolute value of src */
3229 if (inst->Instruction.Opcode == TGSI_OPCODE_RSQ) {
3230 r600_bytecode_src_set_abs(&alu.src[j]);
3231 }
3232 }
3233 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3234 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3235
3236 if (i == last_slot - 1)
3237 alu.last = 1;
3238 r = r600_bytecode_add_alu(ctx->bc, &alu);
3239 if (r)
3240 return r;
3241 }
3242 return 0;
3243 }
3244
3245 static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
3246 {
3247 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3248 int i, j, k, r;
3249 struct r600_bytecode_alu alu;
3250 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3251 int t1 = ctx->temp_reg;
3252
3253 for (k = 0; k <= lasti; k++) {
3254 if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
3255 continue;
3256
3257 for (i = 0 ; i < 4; i++) {
3258 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3259 alu.op = ctx->inst_info->op;
3260 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3261 r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
3262 }
3263 alu.dst.sel = t1;
3264 alu.dst.chan = i;
3265 alu.dst.write = (i == k);
3266 if (i == 3)
3267 alu.last = 1;
3268 r = r600_bytecode_add_alu(ctx->bc, &alu);
3269 if (r)
3270 return r;
3271 }
3272 }
3273
3274 for (i = 0 ; i <= lasti; i++) {
3275 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3276 continue;
3277 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3278 alu.op = ALU_OP1_MOV;
3279 alu.src[0].sel = t1;
3280 alu.src[0].chan = i;
3281 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3282 alu.dst.write = 1;
3283 if (i == lasti)
3284 alu.last = 1;
3285 r = r600_bytecode_add_alu(ctx->bc, &alu);
3286 if (r)
3287 return r;
3288 }
3289
3290 return 0;
3291 }
3292
3293
3294 static int cayman_mul_double_instr(struct r600_shader_ctx *ctx)
3295 {
3296 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3297 int i, j, k, r;
3298 struct r600_bytecode_alu alu;
3299 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3300 int t1 = ctx->temp_reg;
3301
3302 for (k = 0; k < 2; k++) {
3303 if (!(inst->Dst[0].Register.WriteMask & (0x3 << (k * 2))))
3304 continue;
3305
3306 for (i = 0; i < 4; i++) {
3307 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3308 alu.op = ctx->inst_info->op;
3309 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
3310 r600_bytecode_src(&alu.src[j], &ctx->src[j], k * 2 + ((i == 3) ? 0 : 1));;
3311 }
3312 alu.dst.sel = t1;
3313 alu.dst.chan = i;
3314 alu.dst.write = 1;
3315 if (i == 3)
3316 alu.last = 1;
3317 r = r600_bytecode_add_alu(ctx->bc, &alu);
3318 if (r)
3319 return r;
3320 }
3321 }
3322
3323 for (i = 0; i <= lasti; i++) {
3324 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3325 continue;
3326 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3327 alu.op = ALU_OP1_MOV;
3328 alu.src[0].sel = t1;
3329 alu.src[0].chan = i;
3330 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3331 alu.dst.write = 1;
3332 if (i == lasti)
3333 alu.last = 1;
3334 r = r600_bytecode_add_alu(ctx->bc, &alu);
3335 if (r)
3336 return r;
3337 }
3338
3339 return 0;
3340 }
3341
3342 /*
3343 * r600 - trunc to -PI..PI range
3344 * r700 - normalize by dividing by 2PI
3345 * see fdo bug 27901
3346 */
static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
{
	/* NOTE(review): the float->uint32 bit-casts below type-pun through a
	 * pointer cast, which violates strict aliasing; a memcpy or union
	 * would be safer — confirm whether the build disables the
	 * optimization (-fno-strict-aliasing). */
	static float half_inv_pi = 1.0 /(3.1415926535 * 2);
	static float double_pi = 3.1415926535 * 2;
	static float neg_pi = -3.1415926535;

	int r;
	struct r600_bytecode_alu alu;

	/* temp.x = src * 1/(2*PI) + 0.5 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[1].value = *(uint32_t *)&half_inv_pi;
	alu.src[2].sel = V_SQ_ALU_SRC_0_5;
	alu.src[2].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* temp.x = fract(temp.x): wrap the angle into [0, 1) turns */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_FRACT;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* rescale for the hw's expected input range:
	 * r600:  temp.x * 2*PI - PI  -> radians in [-PI, PI]
	 * r700+: temp.x * 1 - 0.5   -> normalized [-0.5, 0.5]
	 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP3_MULADD;
	alu.is_op3 = 1;

	alu.dst.chan = 0;
	alu.dst.sel = ctx->temp_reg;
	alu.dst.write = 1;

	alu.src[0].sel = ctx->temp_reg;
	alu.src[0].chan = 0;

	alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[1].chan = 0;
	alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
	alu.src[2].chan = 0;

	if (ctx->bc->chip_class == R600) {
		alu.src[1].value = *(uint32_t *)&double_pi;
		alu.src[2].value = *(uint32_t *)&neg_pi;
	} else {
		alu.src[1].sel = V_SQ_ALU_SRC_1;
		alu.src[2].sel = V_SQ_ALU_SRC_0_5;
		alu.src[2].neg = 1;
	}

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;
	return 0;
}
3421
3422 static int cayman_trig(struct r600_shader_ctx *ctx)
3423 {
3424 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3425 struct r600_bytecode_alu alu;
3426 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
3427 int i, r;
3428
3429 r = tgsi_setup_trig(ctx);
3430 if (r)
3431 return r;
3432
3433
3434 for (i = 0; i < last_slot; i++) {
3435 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3436 alu.op = ctx->inst_info->op;
3437 alu.dst.chan = i;
3438
3439 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3440 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3441
3442 alu.src[0].sel = ctx->temp_reg;
3443 alu.src[0].chan = 0;
3444 if (i == last_slot - 1)
3445 alu.last = 1;
3446 r = r600_bytecode_add_alu(ctx->bc, &alu);
3447 if (r)
3448 return r;
3449 }
3450 return 0;
3451 }
3452
3453 static int tgsi_trig(struct r600_shader_ctx *ctx)
3454 {
3455 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3456 struct r600_bytecode_alu alu;
3457 int i, r;
3458 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
3459
3460 r = tgsi_setup_trig(ctx);
3461 if (r)
3462 return r;
3463
3464 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3465 alu.op = ctx->inst_info->op;
3466 alu.dst.chan = 0;
3467 alu.dst.sel = ctx->temp_reg;
3468 alu.dst.write = 1;
3469
3470 alu.src[0].sel = ctx->temp_reg;
3471 alu.src[0].chan = 0;
3472 alu.last = 1;
3473 r = r600_bytecode_add_alu(ctx->bc, &alu);
3474 if (r)
3475 return r;
3476
3477 /* replicate result */
3478 for (i = 0; i < lasti + 1; i++) {
3479 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
3480 continue;
3481
3482 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3483 alu.op = ALU_OP1_MOV;
3484
3485 alu.src[0].sel = ctx->temp_reg;
3486 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3487 if (i == lasti)
3488 alu.last = 1;
3489 r = r600_bytecode_add_alu(ctx->bc, &alu);
3490 if (r)
3491 return r;
3492 }
3493 return 0;
3494 }
3495
/* SCS: dst = (cos(src.x), sin(src.x), 0.0, 1.0), per writemask.
 * COS/SIN are transcendental: a single scalar op on r600-r700, but
 * replicated over three slots on Cayman (only the matching slot writes).
 */
static int tgsi_scs(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, r;

	/* We'll only need the trig stuff if we are going to write to the
	 * X or Y components of the destination vector.
	 */
	if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
		r = tgsi_setup_trig(ctx);
		if (r)
			return r;
	}

	/* dst.x = COS */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_COS;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

				/* only slot 0 actually writes dst.x */
				if (i == 0)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_COS;
			tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.y = SIN */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0 ; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_SIN;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				/* only slot 1 actually writes dst.y */
				if (i == 1)
					alu.dst.write = 1;
				else
					alu.dst.write = 0;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_SIN;
			tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);

			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.z = 0.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_0;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* dst.w = 1.0; */
	if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;

		tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);

		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
3616
3617 static int tgsi_kill(struct r600_shader_ctx *ctx)
3618 {
3619 const struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3620 struct r600_bytecode_alu alu;
3621 int i, r;
3622
3623 for (i = 0; i < 4; i++) {
3624 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3625 alu.op = ctx->inst_info->op;
3626
3627 alu.dst.chan = i;
3628
3629 alu.src[0].sel = V_SQ_ALU_SRC_0;
3630
3631 if (inst->Instruction.Opcode == TGSI_OPCODE_KILL) {
3632 alu.src[1].sel = V_SQ_ALU_SRC_1;
3633 alu.src[1].neg = 1;
3634 } else {
3635 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3636 }
3637 if (i == 3) {
3638 alu.last = 1;
3639 }
3640 r = r600_bytecode_add_alu(ctx->bc, &alu);
3641 if (r)
3642 return r;
3643 }
3644
3645 /* kill must be last in ALU */
3646 ctx->bc->force_add_cf = 1;
3647 ctx->shader->uses_kill = TRUE;
3648 return 0;
3649 }
3650
/* LIT: classic fixed-function lighting coefficients:
 *   dst.x = 1.0
 *   dst.y = max(src.x, 0.0)
 *   dst.z = exp(src.w * log(max(src.y, 0.0))), computed via
 *           LOG_CLAMPED / MUL_LIT / EXP_IEEE (MUL_LIT supplies the
 *           hardware's special-case handling)
 *   dst.w = 1.0
 * Transcendentals are replicated over three slots on Cayman.
 */
static int tgsi_lit(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;

	/* tmp.x = max(src.y, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 1;

	alu.dst.sel = ctx->temp_reg;
	alu.dst.chan = 0;
	alu.dst.write = 1;

	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* the expensive pow() chain is only needed when dst.z is written */
	if (inst->Dst[0].Register.WriteMask & (1 << 2))
	{
		int chan;
		int sel;
		int i;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* tmp.z = log(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_LOG_CLAMPED;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* tmp.z = log(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_LOG_CLAMPED;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 2;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* remember where the log landed (same in both branches) */
		chan = alu.dst.chan;
		sel = alu.dst.sel;

		/* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MUL_LIT;
		alu.src[0].sel = sel;
		alu.src[0].chan = chan;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
		r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				/* dst.z = exp(tmp.x) */
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;
				tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				} else
					alu.dst.write = 0;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			/* dst.z = exp(tmp.x) */
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;
			tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* dst.x, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.y = max(src.x, 0.0) */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP2_MAX;
	r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
	alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
	alu.src[1].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	/* dst.w, <- 1.0 */
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));
	alu.op = ALU_OP1_MOV;
	alu.src[0].sel = V_SQ_ALU_SRC_1;
	alu.src[0].chan = 0;
	tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
	alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
	alu.last = 1;
	r = r600_bytecode_add_alu(ctx->bc, &alu);
	if (r)
		return r;

	return 0;
}
3800
3801 static int tgsi_rsq(struct r600_shader_ctx *ctx)
3802 {
3803 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3804 struct r600_bytecode_alu alu;
3805 int i, r;
3806
3807 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3808
3809 /* XXX:
3810 * For state trackers other than OpenGL, we'll want to use
3811 * _RECIPSQRT_IEEE instead.
3812 */
3813 alu.op = ALU_OP1_RECIPSQRT_CLAMPED;
3814
3815 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
3816 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
3817 r600_bytecode_src_set_abs(&alu.src[i]);
3818 }
3819 alu.dst.sel = ctx->temp_reg;
3820 alu.dst.write = 1;
3821 alu.last = 1;
3822 r = r600_bytecode_add_alu(ctx->bc, &alu);
3823 if (r)
3824 return r;
3825 /* replicate result */
3826 return tgsi_helper_tempx_replicate(ctx);
3827 }
3828
3829 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
3830 {
3831 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3832 struct r600_bytecode_alu alu;
3833 int i, r;
3834
3835 for (i = 0; i < 4; i++) {
3836 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3837 alu.src[0].sel = ctx->temp_reg;
3838 alu.op = ALU_OP1_MOV;
3839 alu.dst.chan = i;
3840 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3841 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3842 if (i == 3)
3843 alu.last = 1;
3844 r = r600_bytecode_add_alu(ctx->bc, &alu);
3845 if (r)
3846 return r;
3847 }
3848 return 0;
3849 }
3850
3851 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
3852 {
3853 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3854 struct r600_bytecode_alu alu;
3855 int i, r;
3856
3857 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3858 alu.op = ctx->inst_info->op;
3859 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
3860 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
3861 }
3862 alu.dst.sel = ctx->temp_reg;
3863 alu.dst.write = 1;
3864 alu.last = 1;
3865 r = r600_bytecode_add_alu(ctx->bc, &alu);
3866 if (r)
3867 return r;
3868 /* replicate result */
3869 return tgsi_helper_tempx_replicate(ctx);
3870 }
3871
3872 static int cayman_pow(struct r600_shader_ctx *ctx)
3873 {
3874 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3875 int i, r;
3876 struct r600_bytecode_alu alu;
3877 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
3878
3879 for (i = 0; i < 3; i++) {
3880 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3881 alu.op = ALU_OP1_LOG_IEEE;
3882 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
3883 alu.dst.sel = ctx->temp_reg;
3884 alu.dst.chan = i;
3885 alu.dst.write = 1;
3886 if (i == 2)
3887 alu.last = 1;
3888 r = r600_bytecode_add_alu(ctx->bc, &alu);
3889 if (r)
3890 return r;
3891 }
3892
3893 /* b * LOG2(a) */
3894 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3895 alu.op = ALU_OP2_MUL;
3896 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
3897 alu.src[1].sel = ctx->temp_reg;
3898 alu.dst.sel = ctx->temp_reg;
3899 alu.dst.write = 1;
3900 alu.last = 1;
3901 r = r600_bytecode_add_alu(ctx->bc, &alu);
3902 if (r)
3903 return r;
3904
3905 for (i = 0; i < last_slot; i++) {
3906 /* POW(a,b) = EXP2(b * LOG2(a))*/
3907 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3908 alu.op = ALU_OP1_EXP_IEEE;
3909 alu.src[0].sel = ctx->temp_reg;
3910
3911 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3912 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3913 if (i == last_slot - 1)
3914 alu.last = 1;
3915 r = r600_bytecode_add_alu(ctx->bc, &alu);
3916 if (r)
3917 return r;
3918 }
3919 return 0;
3920 }
3921
3922 static int tgsi_pow(struct r600_shader_ctx *ctx)
3923 {
3924 struct r600_bytecode_alu alu;
3925 int r;
3926
3927 /* LOG2(a) */
3928 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3929 alu.op = ALU_OP1_LOG_IEEE;
3930 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
3931 alu.dst.sel = ctx->temp_reg;
3932 alu.dst.write = 1;
3933 alu.last = 1;
3934 r = r600_bytecode_add_alu(ctx->bc, &alu);
3935 if (r)
3936 return r;
3937 /* b * LOG2(a) */
3938 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3939 alu.op = ALU_OP2_MUL;
3940 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
3941 alu.src[1].sel = ctx->temp_reg;
3942 alu.dst.sel = ctx->temp_reg;
3943 alu.dst.write = 1;
3944 alu.last = 1;
3945 r = r600_bytecode_add_alu(ctx->bc, &alu);
3946 if (r)
3947 return r;
3948 /* POW(a,b) = EXP2(b * LOG2(a))*/
3949 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3950 alu.op = ALU_OP1_EXP_IEEE;
3951 alu.src[0].sel = ctx->temp_reg;
3952 alu.dst.sel = ctx->temp_reg;
3953 alu.dst.write = 1;
3954 alu.last = 1;
3955 r = r600_bytecode_add_alu(ctx->bc, &alu);
3956 if (r)
3957 return r;
3958 return tgsi_helper_tempx_replicate(ctx);
3959 }
3960
3961 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
3962 {
3963 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3964 struct r600_bytecode_alu alu;
3965 int i, r, j;
3966 unsigned write_mask = inst->Dst[0].Register.WriteMask;
3967 int tmp0 = ctx->temp_reg;
3968 int tmp1 = r600_get_temp(ctx);
3969 int tmp2 = r600_get_temp(ctx);
3970 int tmp3 = r600_get_temp(ctx);
3971 /* Unsigned path:
3972 *
3973 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
3974 *
3975 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
3976 * 2. tmp0.z = lo (tmp0.x * src2)
3977 * 3. tmp0.w = -tmp0.z
3978 * 4. tmp0.y = hi (tmp0.x * src2)
3979 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
3980 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
3981 * 7. tmp1.x = tmp0.x - tmp0.w
3982 * 8. tmp1.y = tmp0.x + tmp0.w
3983 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
3984 * 10. tmp0.z = hi(tmp0.x * src1) = q
3985 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
3986 *
3987 * 12. tmp0.w = src1 - tmp0.y = r
3988 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
3989 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
3990 *
3991 * if DIV
3992 *
3993 * 15. tmp1.z = tmp0.z + 1 = q + 1
3994 * 16. tmp1.w = tmp0.z - 1 = q - 1
3995 *
3996 * else MOD
3997 *
3998 * 15. tmp1.z = tmp0.w - src2 = r - src2
3999 * 16. tmp1.w = tmp0.w + src2 = r + src2
4000 *
4001 * endif
4002 *
4003 * 17. tmp1.x = tmp1.x & tmp1.y
4004 *
4005 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
4006 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
4007 *
4008 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
4009 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
4010 *
4011 * Signed path:
4012 *
4013 * Same as unsigned, using abs values of the operands,
4014 * and fixing the sign of the result in the end.
4015 */
4016
4017 for (i = 0; i < 4; i++) {
4018 if (!(write_mask & (1<<i)))
4019 continue;
4020
4021 if (signed_op) {
4022
4023 /* tmp2.x = -src0 */
4024 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4025 alu.op = ALU_OP2_SUB_INT;
4026
4027 alu.dst.sel = tmp2;
4028 alu.dst.chan = 0;
4029 alu.dst.write = 1;
4030
4031 alu.src[0].sel = V_SQ_ALU_SRC_0;
4032
4033 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4034
4035 alu.last = 1;
4036 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4037 return r;
4038
4039 /* tmp2.y = -src1 */
4040 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4041 alu.op = ALU_OP2_SUB_INT;
4042
4043 alu.dst.sel = tmp2;
4044 alu.dst.chan = 1;
4045 alu.dst.write = 1;
4046
4047 alu.src[0].sel = V_SQ_ALU_SRC_0;
4048
4049 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4050
4051 alu.last = 1;
4052 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4053 return r;
4054
4055 /* tmp2.z sign bit is set if src0 and src2 signs are different */
4056 /* it will be a sign of the quotient */
4057 if (!mod) {
4058
4059 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4060 alu.op = ALU_OP2_XOR_INT;
4061
4062 alu.dst.sel = tmp2;
4063 alu.dst.chan = 2;
4064 alu.dst.write = 1;
4065
4066 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4067 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4068
4069 alu.last = 1;
4070 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4071 return r;
4072 }
4073
4074 /* tmp2.x = |src0| */
4075 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4076 alu.op = ALU_OP3_CNDGE_INT;
4077 alu.is_op3 = 1;
4078
4079 alu.dst.sel = tmp2;
4080 alu.dst.chan = 0;
4081 alu.dst.write = 1;
4082
4083 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4084 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4085 alu.src[2].sel = tmp2;
4086 alu.src[2].chan = 0;
4087
4088 alu.last = 1;
4089 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4090 return r;
4091
4092 /* tmp2.y = |src1| */
4093 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4094 alu.op = ALU_OP3_CNDGE_INT;
4095 alu.is_op3 = 1;
4096
4097 alu.dst.sel = tmp2;
4098 alu.dst.chan = 1;
4099 alu.dst.write = 1;
4100
4101 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4102 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4103 alu.src[2].sel = tmp2;
4104 alu.src[2].chan = 1;
4105
4106 alu.last = 1;
4107 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4108 return r;
4109
4110 }
4111
4112 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
4113 if (ctx->bc->chip_class == CAYMAN) {
4114 /* tmp3.x = u2f(src2) */
4115 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4116 alu.op = ALU_OP1_UINT_TO_FLT;
4117
4118 alu.dst.sel = tmp3;
4119 alu.dst.chan = 0;
4120 alu.dst.write = 1;
4121
4122 if (signed_op) {
4123 alu.src[0].sel = tmp2;
4124 alu.src[0].chan = 1;
4125 } else {
4126 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4127 }
4128
4129 alu.last = 1;
4130 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4131 return r;
4132
4133 /* tmp0.x = recip(tmp3.x) */
4134 for (j = 0 ; j < 3; j++) {
4135 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4136 alu.op = ALU_OP1_RECIP_IEEE;
4137
4138 alu.dst.sel = tmp0;
4139 alu.dst.chan = j;
4140 alu.dst.write = (j == 0);
4141
4142 alu.src[0].sel = tmp3;
4143 alu.src[0].chan = 0;
4144
4145 if (j == 2)
4146 alu.last = 1;
4147 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4148 return r;
4149 }
4150
4151 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4152 alu.op = ALU_OP2_MUL;
4153
4154 alu.src[0].sel = tmp0;
4155 alu.src[0].chan = 0;
4156
4157 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
4158 alu.src[1].value = 0x4f800000;
4159
4160 alu.dst.sel = tmp3;
4161 alu.dst.write = 1;
4162 alu.last = 1;
4163 r = r600_bytecode_add_alu(ctx->bc, &alu);
4164 if (r)
4165 return r;
4166
4167 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4168 alu.op = ALU_OP1_FLT_TO_UINT;
4169
4170 alu.dst.sel = tmp0;
4171 alu.dst.chan = 0;
4172 alu.dst.write = 1;
4173
4174 alu.src[0].sel = tmp3;
4175 alu.src[0].chan = 0;
4176
4177 alu.last = 1;
4178 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4179 return r;
4180
4181 } else {
4182 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4183 alu.op = ALU_OP1_RECIP_UINT;
4184
4185 alu.dst.sel = tmp0;
4186 alu.dst.chan = 0;
4187 alu.dst.write = 1;
4188
4189 if (signed_op) {
4190 alu.src[0].sel = tmp2;
4191 alu.src[0].chan = 1;
4192 } else {
4193 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4194 }
4195
4196 alu.last = 1;
4197 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4198 return r;
4199 }
4200
4201 /* 2. tmp0.z = lo (tmp0.x * src2) */
4202 if (ctx->bc->chip_class == CAYMAN) {
4203 for (j = 0 ; j < 4; j++) {
4204 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4205 alu.op = ALU_OP2_MULLO_UINT;
4206
4207 alu.dst.sel = tmp0;
4208 alu.dst.chan = j;
4209 alu.dst.write = (j == 2);
4210
4211 alu.src[0].sel = tmp0;
4212 alu.src[0].chan = 0;
4213 if (signed_op) {
4214 alu.src[1].sel = tmp2;
4215 alu.src[1].chan = 1;
4216 } else {
4217 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4218 }
4219
4220 alu.last = (j == 3);
4221 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4222 return r;
4223 }
4224 } else {
4225 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4226 alu.op = ALU_OP2_MULLO_UINT;
4227
4228 alu.dst.sel = tmp0;
4229 alu.dst.chan = 2;
4230 alu.dst.write = 1;
4231
4232 alu.src[0].sel = tmp0;
4233 alu.src[0].chan = 0;
4234 if (signed_op) {
4235 alu.src[1].sel = tmp2;
4236 alu.src[1].chan = 1;
4237 } else {
4238 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4239 }
4240
4241 alu.last = 1;
4242 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4243 return r;
4244 }
4245
4246 /* 3. tmp0.w = -tmp0.z */
4247 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4248 alu.op = ALU_OP2_SUB_INT;
4249
4250 alu.dst.sel = tmp0;
4251 alu.dst.chan = 3;
4252 alu.dst.write = 1;
4253
4254 alu.src[0].sel = V_SQ_ALU_SRC_0;
4255 alu.src[1].sel = tmp0;
4256 alu.src[1].chan = 2;
4257
4258 alu.last = 1;
4259 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4260 return r;
4261
4262 /* 4. tmp0.y = hi (tmp0.x * src2) */
4263 if (ctx->bc->chip_class == CAYMAN) {
4264 for (j = 0 ; j < 4; j++) {
4265 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4266 alu.op = ALU_OP2_MULHI_UINT;
4267
4268 alu.dst.sel = tmp0;
4269 alu.dst.chan = j;
4270 alu.dst.write = (j == 1);
4271
4272 alu.src[0].sel = tmp0;
4273 alu.src[0].chan = 0;
4274
4275 if (signed_op) {
4276 alu.src[1].sel = tmp2;
4277 alu.src[1].chan = 1;
4278 } else {
4279 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4280 }
4281 alu.last = (j == 3);
4282 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4283 return r;
4284 }
4285 } else {
4286 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4287 alu.op = ALU_OP2_MULHI_UINT;
4288
4289 alu.dst.sel = tmp0;
4290 alu.dst.chan = 1;
4291 alu.dst.write = 1;
4292
4293 alu.src[0].sel = tmp0;
4294 alu.src[0].chan = 0;
4295
4296 if (signed_op) {
4297 alu.src[1].sel = tmp2;
4298 alu.src[1].chan = 1;
4299 } else {
4300 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4301 }
4302
4303 alu.last = 1;
4304 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4305 return r;
4306 }
4307
4308 /* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src)) */
4309 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4310 alu.op = ALU_OP3_CNDE_INT;
4311 alu.is_op3 = 1;
4312
4313 alu.dst.sel = tmp0;
4314 alu.dst.chan = 2;
4315 alu.dst.write = 1;
4316
4317 alu.src[0].sel = tmp0;
4318 alu.src[0].chan = 1;
4319 alu.src[1].sel = tmp0;
4320 alu.src[1].chan = 3;
4321 alu.src[2].sel = tmp0;
4322 alu.src[2].chan = 2;
4323
4324 alu.last = 1;
4325 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4326 return r;
4327
4328 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
4329 if (ctx->bc->chip_class == CAYMAN) {
4330 for (j = 0 ; j < 4; j++) {
4331 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4332 alu.op = ALU_OP2_MULHI_UINT;
4333
4334 alu.dst.sel = tmp0;
4335 alu.dst.chan = j;
4336 alu.dst.write = (j == 3);
4337
4338 alu.src[0].sel = tmp0;
4339 alu.src[0].chan = 2;
4340
4341 alu.src[1].sel = tmp0;
4342 alu.src[1].chan = 0;
4343
4344 alu.last = (j == 3);
4345 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4346 return r;
4347 }
4348 } else {
4349 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4350 alu.op = ALU_OP2_MULHI_UINT;
4351
4352 alu.dst.sel = tmp0;
4353 alu.dst.chan = 3;
4354 alu.dst.write = 1;
4355
4356 alu.src[0].sel = tmp0;
4357 alu.src[0].chan = 2;
4358
4359 alu.src[1].sel = tmp0;
4360 alu.src[1].chan = 0;
4361
4362 alu.last = 1;
4363 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4364 return r;
4365 }
4366
4367 /* 7. tmp1.x = tmp0.x - tmp0.w */
4368 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4369 alu.op = ALU_OP2_SUB_INT;
4370
4371 alu.dst.sel = tmp1;
4372 alu.dst.chan = 0;
4373 alu.dst.write = 1;
4374
4375 alu.src[0].sel = tmp0;
4376 alu.src[0].chan = 0;
4377 alu.src[1].sel = tmp0;
4378 alu.src[1].chan = 3;
4379
4380 alu.last = 1;
4381 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4382 return r;
4383
4384 /* 8. tmp1.y = tmp0.x + tmp0.w */
4385 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4386 alu.op = ALU_OP2_ADD_INT;
4387
4388 alu.dst.sel = tmp1;
4389 alu.dst.chan = 1;
4390 alu.dst.write = 1;
4391
4392 alu.src[0].sel = tmp0;
4393 alu.src[0].chan = 0;
4394 alu.src[1].sel = tmp0;
4395 alu.src[1].chan = 3;
4396
4397 alu.last = 1;
4398 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4399 return r;
4400
4401 /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
4402 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4403 alu.op = ALU_OP3_CNDE_INT;
4404 alu.is_op3 = 1;
4405
4406 alu.dst.sel = tmp0;
4407 alu.dst.chan = 0;
4408 alu.dst.write = 1;
4409
4410 alu.src[0].sel = tmp0;
4411 alu.src[0].chan = 1;
4412 alu.src[1].sel = tmp1;
4413 alu.src[1].chan = 1;
4414 alu.src[2].sel = tmp1;
4415 alu.src[2].chan = 0;
4416
4417 alu.last = 1;
4418 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4419 return r;
4420
4421 /* 10. tmp0.z = hi(tmp0.x * src1) = q */
4422 if (ctx->bc->chip_class == CAYMAN) {
4423 for (j = 0 ; j < 4; j++) {
4424 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4425 alu.op = ALU_OP2_MULHI_UINT;
4426
4427 alu.dst.sel = tmp0;
4428 alu.dst.chan = j;
4429 alu.dst.write = (j == 2);
4430
4431 alu.src[0].sel = tmp0;
4432 alu.src[0].chan = 0;
4433
4434 if (signed_op) {
4435 alu.src[1].sel = tmp2;
4436 alu.src[1].chan = 0;
4437 } else {
4438 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4439 }
4440
4441 alu.last = (j == 3);
4442 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4443 return r;
4444 }
4445 } else {
4446 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4447 alu.op = ALU_OP2_MULHI_UINT;
4448
4449 alu.dst.sel = tmp0;
4450 alu.dst.chan = 2;
4451 alu.dst.write = 1;
4452
4453 alu.src[0].sel = tmp0;
4454 alu.src[0].chan = 0;
4455
4456 if (signed_op) {
4457 alu.src[1].sel = tmp2;
4458 alu.src[1].chan = 0;
4459 } else {
4460 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4461 }
4462
4463 alu.last = 1;
4464 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4465 return r;
4466 }
4467
4468 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
4469 if (ctx->bc->chip_class == CAYMAN) {
4470 for (j = 0 ; j < 4; j++) {
4471 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4472 alu.op = ALU_OP2_MULLO_UINT;
4473
4474 alu.dst.sel = tmp0;
4475 alu.dst.chan = j;
4476 alu.dst.write = (j == 1);
4477
4478 if (signed_op) {
4479 alu.src[0].sel = tmp2;
4480 alu.src[0].chan = 1;
4481 } else {
4482 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4483 }
4484
4485 alu.src[1].sel = tmp0;
4486 alu.src[1].chan = 2;
4487
4488 alu.last = (j == 3);
4489 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4490 return r;
4491 }
4492 } else {
4493 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4494 alu.op = ALU_OP2_MULLO_UINT;
4495
4496 alu.dst.sel = tmp0;
4497 alu.dst.chan = 1;
4498 alu.dst.write = 1;
4499
4500 if (signed_op) {
4501 alu.src[0].sel = tmp2;
4502 alu.src[0].chan = 1;
4503 } else {
4504 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4505 }
4506
4507 alu.src[1].sel = tmp0;
4508 alu.src[1].chan = 2;
4509
4510 alu.last = 1;
4511 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4512 return r;
4513 }
4514
4515 /* 12. tmp0.w = src1 - tmp0.y = r */
4516 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4517 alu.op = ALU_OP2_SUB_INT;
4518
4519 alu.dst.sel = tmp0;
4520 alu.dst.chan = 3;
4521 alu.dst.write = 1;
4522
4523 if (signed_op) {
4524 alu.src[0].sel = tmp2;
4525 alu.src[0].chan = 0;
4526 } else {
4527 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4528 }
4529
4530 alu.src[1].sel = tmp0;
4531 alu.src[1].chan = 1;
4532
4533 alu.last = 1;
4534 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4535 return r;
4536
4537 /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
4538 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4539 alu.op = ALU_OP2_SETGE_UINT;
4540
4541 alu.dst.sel = tmp1;
4542 alu.dst.chan = 0;
4543 alu.dst.write = 1;
4544
4545 alu.src[0].sel = tmp0;
4546 alu.src[0].chan = 3;
4547 if (signed_op) {
4548 alu.src[1].sel = tmp2;
4549 alu.src[1].chan = 1;
4550 } else {
4551 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4552 }
4553
4554 alu.last = 1;
4555 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4556 return r;
4557
4558 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
4559 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4560 alu.op = ALU_OP2_SETGE_UINT;
4561
4562 alu.dst.sel = tmp1;
4563 alu.dst.chan = 1;
4564 alu.dst.write = 1;
4565
4566 if (signed_op) {
4567 alu.src[0].sel = tmp2;
4568 alu.src[0].chan = 0;
4569 } else {
4570 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4571 }
4572
4573 alu.src[1].sel = tmp0;
4574 alu.src[1].chan = 1;
4575
4576 alu.last = 1;
4577 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4578 return r;
4579
4580 if (mod) { /* UMOD */
4581
4582 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */
4583 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4584 alu.op = ALU_OP2_SUB_INT;
4585
4586 alu.dst.sel = tmp1;
4587 alu.dst.chan = 2;
4588 alu.dst.write = 1;
4589
4590 alu.src[0].sel = tmp0;
4591 alu.src[0].chan = 3;
4592
4593 if (signed_op) {
4594 alu.src[1].sel = tmp2;
4595 alu.src[1].chan = 1;
4596 } else {
4597 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4598 }
4599
4600 alu.last = 1;
4601 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4602 return r;
4603
4604 /* 16. tmp1.w = tmp0.w + src2 = r + src2 */
4605 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4606 alu.op = ALU_OP2_ADD_INT;
4607
4608 alu.dst.sel = tmp1;
4609 alu.dst.chan = 3;
4610 alu.dst.write = 1;
4611
4612 alu.src[0].sel = tmp0;
4613 alu.src[0].chan = 3;
4614 if (signed_op) {
4615 alu.src[1].sel = tmp2;
4616 alu.src[1].chan = 1;
4617 } else {
4618 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4619 }
4620
4621 alu.last = 1;
4622 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4623 return r;
4624
4625 } else { /* UDIV */
4626
4627 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
4628 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4629 alu.op = ALU_OP2_ADD_INT;
4630
4631 alu.dst.sel = tmp1;
4632 alu.dst.chan = 2;
4633 alu.dst.write = 1;
4634
4635 alu.src[0].sel = tmp0;
4636 alu.src[0].chan = 2;
4637 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
4638
4639 alu.last = 1;
4640 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4641 return r;
4642
4643 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */
4644 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4645 alu.op = ALU_OP2_ADD_INT;
4646
4647 alu.dst.sel = tmp1;
4648 alu.dst.chan = 3;
4649 alu.dst.write = 1;
4650
4651 alu.src[0].sel = tmp0;
4652 alu.src[0].chan = 2;
4653 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
4654
4655 alu.last = 1;
4656 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4657 return r;
4658
4659 }
4660
4661 /* 17. tmp1.x = tmp1.x & tmp1.y */
4662 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4663 alu.op = ALU_OP2_AND_INT;
4664
4665 alu.dst.sel = tmp1;
4666 alu.dst.chan = 0;
4667 alu.dst.write = 1;
4668
4669 alu.src[0].sel = tmp1;
4670 alu.src[0].chan = 0;
4671 alu.src[1].sel = tmp1;
4672 alu.src[1].chan = 1;
4673
4674 alu.last = 1;
4675 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4676 return r;
4677
4678 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
4679 /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
4680 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4681 alu.op = ALU_OP3_CNDE_INT;
4682 alu.is_op3 = 1;
4683
4684 alu.dst.sel = tmp0;
4685 alu.dst.chan = 2;
4686 alu.dst.write = 1;
4687
4688 alu.src[0].sel = tmp1;
4689 alu.src[0].chan = 0;
4690 alu.src[1].sel = tmp0;
4691 alu.src[1].chan = mod ? 3 : 2;
4692 alu.src[2].sel = tmp1;
4693 alu.src[2].chan = 2;
4694
4695 alu.last = 1;
4696 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4697 return r;
4698
4699 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
4700 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4701 alu.op = ALU_OP3_CNDE_INT;
4702 alu.is_op3 = 1;
4703
4704 if (signed_op) {
4705 alu.dst.sel = tmp0;
4706 alu.dst.chan = 2;
4707 alu.dst.write = 1;
4708 } else {
4709 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4710 }
4711
4712 alu.src[0].sel = tmp1;
4713 alu.src[0].chan = 1;
4714 alu.src[1].sel = tmp1;
4715 alu.src[1].chan = 3;
4716 alu.src[2].sel = tmp0;
4717 alu.src[2].chan = 2;
4718
4719 alu.last = 1;
4720 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4721 return r;
4722
4723 if (signed_op) {
4724
4725 /* fix the sign of the result */
4726
4727 if (mod) {
4728
4729 /* tmp0.x = -tmp0.z */
4730 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4731 alu.op = ALU_OP2_SUB_INT;
4732
4733 alu.dst.sel = tmp0;
4734 alu.dst.chan = 0;
4735 alu.dst.write = 1;
4736
4737 alu.src[0].sel = V_SQ_ALU_SRC_0;
4738 alu.src[1].sel = tmp0;
4739 alu.src[1].chan = 2;
4740
4741 alu.last = 1;
4742 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4743 return r;
4744
4745 /* sign of the remainder is the same as the sign of src0 */
4746 /* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
4747 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4748 alu.op = ALU_OP3_CNDGE_INT;
4749 alu.is_op3 = 1;
4750
4751 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4752
4753 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4754 alu.src[1].sel = tmp0;
4755 alu.src[1].chan = 2;
4756 alu.src[2].sel = tmp0;
4757 alu.src[2].chan = 0;
4758
4759 alu.last = 1;
4760 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4761 return r;
4762
4763 } else {
4764
4765 /* tmp0.x = -tmp0.z */
4766 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4767 alu.op = ALU_OP2_SUB_INT;
4768
4769 alu.dst.sel = tmp0;
4770 alu.dst.chan = 0;
4771 alu.dst.write = 1;
4772
4773 alu.src[0].sel = V_SQ_ALU_SRC_0;
4774 alu.src[1].sel = tmp0;
4775 alu.src[1].chan = 2;
4776
4777 alu.last = 1;
4778 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4779 return r;
4780
4781 /* fix the quotient sign (same as the sign of src0*src1) */
4782 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
4783 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4784 alu.op = ALU_OP3_CNDGE_INT;
4785 alu.is_op3 = 1;
4786
4787 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4788
4789 alu.src[0].sel = tmp2;
4790 alu.src[0].chan = 2;
4791 alu.src[1].sel = tmp0;
4792 alu.src[1].chan = 2;
4793 alu.src[2].sel = tmp0;
4794 alu.src[2].chan = 0;
4795
4796 alu.last = 1;
4797 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4798 return r;
4799 }
4800 }
4801 }
4802 return 0;
4803 }
4804
/* TGSI UDIV: unsigned integer division, dst = src0 / src1. */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 0); /* mod = 0 (quotient), signed_op = 0 */
}
4809
/* TGSI UMOD: unsigned integer remainder, dst = src0 % src1. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 0); /* mod = 1 (remainder), signed_op = 0 */
}
4814
/* TGSI IDIV: signed integer division, dst = src0 / src1. */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 0, 1); /* mod = 0 (quotient), signed_op = 1 */
}
4819
/* TGSI IMOD: signed integer remainder, dst = src0 % src1. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	return tgsi_divmod(ctx, 1, 1); /* mod = 1 (remainder), signed_op = 1 */
}
4824
4825
4826 static int tgsi_f2i(struct r600_shader_ctx *ctx)
4827 {
4828 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4829 struct r600_bytecode_alu alu;
4830 int i, r;
4831 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4832 int last_inst = tgsi_last_instruction(write_mask);
4833
4834 for (i = 0; i < 4; i++) {
4835 if (!(write_mask & (1<<i)))
4836 continue;
4837
4838 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4839 alu.op = ALU_OP1_TRUNC;
4840
4841 alu.dst.sel = ctx->temp_reg;
4842 alu.dst.chan = i;
4843 alu.dst.write = 1;
4844
4845 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4846 if (i == last_inst)
4847 alu.last = 1;
4848 r = r600_bytecode_add_alu(ctx->bc, &alu);
4849 if (r)
4850 return r;
4851 }
4852
4853 for (i = 0; i < 4; i++) {
4854 if (!(write_mask & (1<<i)))
4855 continue;
4856
4857 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4858 alu.op = ctx->inst_info->op;
4859
4860 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4861
4862 alu.src[0].sel = ctx->temp_reg;
4863 alu.src[0].chan = i;
4864
4865 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
4866 alu.last = 1;
4867 r = r600_bytecode_add_alu(ctx->bc, &alu);
4868 if (r)
4869 return r;
4870 }
4871
4872 return 0;
4873 }
4874
4875 static int tgsi_iabs(struct r600_shader_ctx *ctx)
4876 {
4877 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4878 struct r600_bytecode_alu alu;
4879 int i, r;
4880 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4881 int last_inst = tgsi_last_instruction(write_mask);
4882
4883 /* tmp = -src */
4884 for (i = 0; i < 4; i++) {
4885 if (!(write_mask & (1<<i)))
4886 continue;
4887
4888 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4889 alu.op = ALU_OP2_SUB_INT;
4890
4891 alu.dst.sel = ctx->temp_reg;
4892 alu.dst.chan = i;
4893 alu.dst.write = 1;
4894
4895 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4896 alu.src[0].sel = V_SQ_ALU_SRC_0;
4897
4898 if (i == last_inst)
4899 alu.last = 1;
4900 r = r600_bytecode_add_alu(ctx->bc, &alu);
4901 if (r)
4902 return r;
4903 }
4904
4905 /* dst = (src >= 0 ? src : tmp) */
4906 for (i = 0; i < 4; i++) {
4907 if (!(write_mask & (1<<i)))
4908 continue;
4909
4910 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4911 alu.op = ALU_OP3_CNDGE_INT;
4912 alu.is_op3 = 1;
4913 alu.dst.write = 1;
4914
4915 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4916
4917 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4918 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4919 alu.src[2].sel = ctx->temp_reg;
4920 alu.src[2].chan = i;
4921
4922 if (i == last_inst)
4923 alu.last = 1;
4924 r = r600_bytecode_add_alu(ctx->bc, &alu);
4925 if (r)
4926 return r;
4927 }
4928 return 0;
4929 }
4930
4931 static int tgsi_issg(struct r600_shader_ctx *ctx)
4932 {
4933 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4934 struct r600_bytecode_alu alu;
4935 int i, r;
4936 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4937 int last_inst = tgsi_last_instruction(write_mask);
4938
4939 /* tmp = (src >= 0 ? src : -1) */
4940 for (i = 0; i < 4; i++) {
4941 if (!(write_mask & (1<<i)))
4942 continue;
4943
4944 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4945 alu.op = ALU_OP3_CNDGE_INT;
4946 alu.is_op3 = 1;
4947
4948 alu.dst.sel = ctx->temp_reg;
4949 alu.dst.chan = i;
4950 alu.dst.write = 1;
4951
4952 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4953 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4954 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
4955
4956 if (i == last_inst)
4957 alu.last = 1;
4958 r = r600_bytecode_add_alu(ctx->bc, &alu);
4959 if (r)
4960 return r;
4961 }
4962
4963 /* dst = (tmp > 0 ? 1 : tmp) */
4964 for (i = 0; i < 4; i++) {
4965 if (!(write_mask & (1<<i)))
4966 continue;
4967
4968 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4969 alu.op = ALU_OP3_CNDGT_INT;
4970 alu.is_op3 = 1;
4971 alu.dst.write = 1;
4972
4973 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4974
4975 alu.src[0].sel = ctx->temp_reg;
4976 alu.src[0].chan = i;
4977
4978 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
4979
4980 alu.src[2].sel = ctx->temp_reg;
4981 alu.src[2].chan = i;
4982
4983 if (i == last_inst)
4984 alu.last = 1;
4985 r = r600_bytecode_add_alu(ctx->bc, &alu);
4986 if (r)
4987 return r;
4988 }
4989 return 0;
4990 }
4991
4992
4993
4994 static int tgsi_ssg(struct r600_shader_ctx *ctx)
4995 {
4996 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4997 struct r600_bytecode_alu alu;
4998 int i, r;
4999
5000 /* tmp = (src > 0 ? 1 : src) */
5001 for (i = 0; i < 4; i++) {
5002 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5003 alu.op = ALU_OP3_CNDGT;
5004 alu.is_op3 = 1;
5005
5006 alu.dst.sel = ctx->temp_reg;
5007 alu.dst.chan = i;
5008
5009 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5010 alu.src[1].sel = V_SQ_ALU_SRC_1;
5011 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
5012
5013 if (i == 3)
5014 alu.last = 1;
5015 r = r600_bytecode_add_alu(ctx->bc, &alu);
5016 if (r)
5017 return r;
5018 }
5019
5020 /* dst = (-tmp > 0 ? -1 : tmp) */
5021 for (i = 0; i < 4; i++) {
5022 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5023 alu.op = ALU_OP3_CNDGT;
5024 alu.is_op3 = 1;
5025 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5026
5027 alu.src[0].sel = ctx->temp_reg;
5028 alu.src[0].chan = i;
5029 alu.src[0].neg = 1;
5030
5031 alu.src[1].sel = V_SQ_ALU_SRC_1;
5032 alu.src[1].neg = 1;
5033
5034 alu.src[2].sel = ctx->temp_reg;
5035 alu.src[2].chan = i;
5036
5037 if (i == 3)
5038 alu.last = 1;
5039 r = r600_bytecode_add_alu(ctx->bc, &alu);
5040 if (r)
5041 return r;
5042 }
5043 return 0;
5044 }
5045
5046 static int tgsi_bfi(struct r600_shader_ctx *ctx)
5047 {
5048 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5049 struct r600_bytecode_alu alu;
5050 int i, r, t1, t2;
5051
5052 unsigned write_mask = inst->Dst[0].Register.WriteMask;
5053 int last_inst = tgsi_last_instruction(write_mask);
5054
5055 t1 = ctx->temp_reg;
5056
5057 for (i = 0; i < 4; i++) {
5058 if (!(write_mask & (1<<i)))
5059 continue;
5060
5061 /* create mask tmp */
5062 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5063 alu.op = ALU_OP2_BFM_INT;
5064 alu.dst.sel = t1;
5065 alu.dst.chan = i;
5066 alu.dst.write = 1;
5067 alu.last = i == last_inst;
5068
5069 r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
5070 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5071
5072 r = r600_bytecode_add_alu(ctx->bc, &alu);
5073 if (r)
5074 return r;
5075 }
5076
5077 t2 = r600_get_temp(ctx);
5078
5079 for (i = 0; i < 4; i++) {
5080 if (!(write_mask & (1<<i)))
5081 continue;
5082
5083 /* shift insert left */
5084 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5085 alu.op = ALU_OP2_LSHL_INT;
5086 alu.dst.sel = t2;
5087 alu.dst.chan = i;
5088 alu.dst.write = 1;
5089 alu.last = i == last_inst;
5090
5091 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5092 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5093
5094 r = r600_bytecode_add_alu(ctx->bc, &alu);
5095 if (r)
5096 return r;
5097 }
5098
5099 for (i = 0; i < 4; i++) {
5100 if (!(write_mask & (1<<i)))
5101 continue;
5102
5103 /* actual bitfield insert */
5104 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5105 alu.op = ALU_OP3_BFI_INT;
5106 alu.is_op3 = 1;
5107 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5108 alu.dst.chan = i;
5109 alu.dst.write = 1;
5110 alu.last = i == last_inst;
5111
5112 alu.src[0].sel = t1;
5113 alu.src[0].chan = i;
5114 alu.src[1].sel = t2;
5115 alu.src[1].chan = i;
5116 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
5117
5118 r = r600_bytecode_add_alu(ctx->bc, &alu);
5119 if (r)
5120 return r;
5121 }
5122
5123 return 0;
5124 }
5125
5126 static int tgsi_msb(struct r600_shader_ctx *ctx)
5127 {
5128 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5129 struct r600_bytecode_alu alu;
5130 int i, r, t1, t2;
5131
5132 unsigned write_mask = inst->Dst[0].Register.WriteMask;
5133 int last_inst = tgsi_last_instruction(write_mask);
5134
5135 assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
5136 ctx->inst_info->op == ALU_OP1_FFBH_UINT);
5137
5138 t1 = ctx->temp_reg;
5139
5140 /* bit position is indexed from lsb by TGSI, and from msb by the hardware */
5141 for (i = 0; i < 4; i++) {
5142 if (!(write_mask & (1<<i)))
5143 continue;
5144
5145 /* t1 = FFBH_INT / FFBH_UINT */
5146 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5147 alu.op = ctx->inst_info->op;
5148 alu.dst.sel = t1;
5149 alu.dst.chan = i;
5150 alu.dst.write = 1;
5151 alu.last = i == last_inst;
5152
5153 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5154
5155 r = r600_bytecode_add_alu(ctx->bc, &alu);
5156 if (r)
5157 return r;
5158 }
5159
5160 t2 = r600_get_temp(ctx);
5161
5162 for (i = 0; i < 4; i++) {
5163 if (!(write_mask & (1<<i)))
5164 continue;
5165
5166 /* t2 = 31 - t1 */
5167 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5168 alu.op = ALU_OP2_SUB_INT;
5169 alu.dst.sel = t2;
5170 alu.dst.chan = i;
5171 alu.dst.write = 1;
5172 alu.last = i == last_inst;
5173
5174 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
5175 alu.src[0].value = 31;
5176 alu.src[1].sel = t1;
5177 alu.src[1].chan = i;
5178
5179 r = r600_bytecode_add_alu(ctx->bc, &alu);
5180 if (r)
5181 return r;
5182 }
5183
5184 for (i = 0; i < 4; i++) {
5185 if (!(write_mask & (1<<i)))
5186 continue;
5187
5188 /* result = t1 >= 0 ? t2 : t1 */
5189 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5190 alu.op = ALU_OP3_CNDGE_INT;
5191 alu.is_op3 = 1;
5192 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5193 alu.dst.chan = i;
5194 alu.dst.write = 1;
5195 alu.last = i == last_inst;
5196
5197 alu.src[0].sel = t1;
5198 alu.src[0].chan = i;
5199 alu.src[1].sel = t2;
5200 alu.src[1].chan = i;
5201 alu.src[2].sel = t1;
5202 alu.src[2].chan = i;
5203
5204 r = r600_bytecode_add_alu(ctx->bc, &alu);
5205 if (r)
5206 return r;
5207 }
5208
5209 return 0;
5210 }
5211
/* Emit TGSI_OPCODE_INTERP_CENTROID / INTERP_OFFSET / INTERP_SAMPLE for
 * evergreen/cayman: re-interpolate a fragment shader input with the
 * hardware INTERP_XY/INTERP_ZW ALU ops.  For OFFSET/SAMPLE the (i,j)
 * barycentrics are first adjusted via screen-space gradients.
 */
static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
	unsigned location;
	int input;

	assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);

	input = inst->Src[0].Register.Index;

	/* Interpolators have been marked for use already by allocate_system_value_inputs */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
	}
	else {
		location = TGSI_INTERPOLATE_LOC_CENTROID;
	}

	/* Find the (i,j) interpolator pair matching this input's mode/location;
	 * fall back to interpolator 0 if none was allocated. */
	k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
	if (k < 0)
		k = 0;
	/* Two (i,j) pairs are packed per GPR: pair 0 in chans 0-1, pair 1 in 2-3. */
	interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
	interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);

	/* NOTE: currently offset is not perspective correct */
	if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
		inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
		int sample_gpr = -1;
		int gradientsH, gradientsV;
		struct r600_bytecode_tex tex;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			/* src1.x selects the sample; its position becomes the offset. */
			sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
		}

		/* Fetch d(i,j)/dx and d(i,j)/dy of the barycentrics via the
		 * texture unit's gradient instructions. */
		gradientsH = r600_get_temp(ctx);
		gradientsV = r600_get_temp(ctx);
		for (i = 0; i < 2; i++) {
			memset(&tex, 0, sizeof(struct r600_bytecode_tex));
			tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
			tex.src_gpr = interp_gpr;
			tex.src_sel_x = interp_base_chan + 0;
			tex.src_sel_y = interp_base_chan + 1;
			tex.src_sel_z = 0;
			tex.src_sel_w = 0;
			tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
			tex.dst_sel_x = 0;
			tex.dst_sel_y = 1;
			tex.dst_sel_z = 7; /* 7 = mask the channel off */
			tex.dst_sel_w = 7;
			tex.inst_mod = 1; // Use per pixel gradient calculation
			tex.sampler_id = 0;
			tex.resource_id = tex.sampler_id;
			r = r600_bytecode_add_tex(ctx->bc, &tex);
			if (r)
				return r;
		}

		/* temp.xy = ij + gradientH * offset.x */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsH;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				/* sample position offset is in chans 2-3 of sample_gpr */
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 2;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
			}
			alu.src[2].sel = interp_gpr;
			alu.src[2].chan = interp_base_chan + i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* temp.xy += gradientV * offset.y */
		for (i = 0; i < 2; i++) {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP3_MULADD;
			alu.is_op3 = 1;
			alu.src[0].sel = gradientsV;
			alu.src[0].chan = i;
			if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
				alu.src[1].sel = sample_gpr;
				alu.src[1].chan = 3;
			}
			else {
				r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
			}
			alu.src[2].sel = ctx->temp_reg;
			alu.src[2].chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = i;
			alu.last = i == 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* Interpolate all four components: first INTERP_ZW (writes chans 2-3,
	 * iterations 0-3), then INTERP_XY (writes chans 0-1, iterations 4-7).
	 * Each group of 4 issues the full vector slot; dst.write is only set
	 * on the channels each op actually produces (i in 2..5). */
	tmp = r600_get_temp(ctx);
	for (i = 0; i < 8; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;

		alu.dst.sel = tmp;
		if ((i > 1 && i < 6)) {
			alu.dst.write = 1;
		}
		else {
			alu.dst.write = 0;
		}
		alu.dst.chan = i % 4;

		if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
			inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
			/* use the adjusted barycentrics computed above */
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1 - (i % 2);
		} else {
			alu.src[0].sel = interp_gpr;
			alu.src[0].chan = interp_base_chan + 1 - (i % 2);
		}
		alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
		alu.src[1].chan = 0;

		alu.last = i % 4 == 3;
		/* INTERP ops require this bank swizzle */
		alu.bank_swizzle_force = SQ_ALU_VEC_210;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	// INTERP can't swizzle dst
	lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	for (i = 0; i <= lasti; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = tmp;
		alu.src[0].chan = ctx->src[0].swizzle[i];
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.write = 1;
		alu.last = i == lasti;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	return 0;
}
5375
5376
5377 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
5378 {
5379 struct r600_bytecode_alu alu;
5380 int i, r;
5381
5382 for (i = 0; i < 4; i++) {
5383 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5384 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
5385 alu.op = ALU_OP0_NOP;
5386 alu.dst.chan = i;
5387 } else {
5388 alu.op = ALU_OP1_MOV;
5389 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5390 alu.src[0].sel = ctx->temp_reg;
5391 alu.src[0].chan = i;
5392 }
5393 if (i == 3) {
5394 alu.last = 1;
5395 }
5396 r = r600_bytecode_add_alu(ctx->bc, &alu);
5397 if (r)
5398 return r;
5399 }
5400 return 0;
5401 }
5402
5403 static int tgsi_make_src_for_op3(struct r600_shader_ctx *ctx,
5404 unsigned temp, int chan,
5405 struct r600_bytecode_alu_src *bc_src,
5406 const struct r600_shader_src *shader_src)
5407 {
5408 struct r600_bytecode_alu alu;
5409 int r;
5410
5411 r600_bytecode_src(bc_src, shader_src, chan);
5412
5413 /* op3 operands don't support abs modifier */
5414 if (bc_src->abs) {
5415 assert(temp!=0); /* we actually need the extra register, make sure it is allocated. */
5416 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5417 alu.op = ALU_OP1_MOV;
5418 alu.dst.sel = temp;
5419 alu.dst.chan = chan;
5420 alu.dst.write = 1;
5421
5422 alu.src[0] = *bc_src;
5423 alu.last = true; // sufficient?
5424 r = r600_bytecode_add_alu(ctx->bc, &alu);
5425 if (r)
5426 return r;
5427
5428 memset(bc_src, 0, sizeof(*bc_src));
5429 bc_src->sel = temp;
5430 bc_src->chan = chan;
5431 }
5432 return 0;
5433 }
5434
5435 static int tgsi_op3(struct r600_shader_ctx *ctx)
5436 {
5437 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5438 struct r600_bytecode_alu alu;
5439 int i, j, r;
5440 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5441 int temp_regs[4];
5442
5443 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5444 temp_regs[j] = 0;
5445 if (ctx->src[j].abs)
5446 temp_regs[j] = r600_get_temp(ctx);
5447 }
5448 for (i = 0; i < lasti + 1; i++) {
5449 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5450 continue;
5451
5452 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5453 alu.op = ctx->inst_info->op;
5454 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5455 r = tgsi_make_src_for_op3(ctx, temp_regs[j], i, &alu.src[j], &ctx->src[j]);
5456 if (r)
5457 return r;
5458 }
5459
5460 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5461 alu.dst.chan = i;
5462 alu.dst.write = 1;
5463 alu.is_op3 = 1;
5464 if (i == lasti) {
5465 alu.last = 1;
5466 }
5467 r = r600_bytecode_add_alu(ctx->bc, &alu);
5468 if (r)
5469 return r;
5470 }
5471 return 0;
5472 }
5473
5474 static int tgsi_dp(struct r600_shader_ctx *ctx)
5475 {
5476 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5477 struct r600_bytecode_alu alu;
5478 int i, j, r;
5479
5480 for (i = 0; i < 4; i++) {
5481 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5482 alu.op = ctx->inst_info->op;
5483 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
5484 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
5485 }
5486
5487 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5488 alu.dst.chan = i;
5489 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
5490 /* handle some special cases */
5491 switch (inst->Instruction.Opcode) {
5492 case TGSI_OPCODE_DP2:
5493 if (i > 1) {
5494 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
5495 alu.src[0].chan = alu.src[1].chan = 0;
5496 }
5497 break;
5498 case TGSI_OPCODE_DP3:
5499 if (i > 2) {
5500 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
5501 alu.src[0].chan = alu.src[1].chan = 0;
5502 }
5503 break;
5504 case TGSI_OPCODE_DPH:
5505 if (i == 3) {
5506 alu.src[0].sel = V_SQ_ALU_SRC_1;
5507 alu.src[0].chan = 0;
5508 alu.src[0].neg = 0;
5509 }
5510 break;
5511 default:
5512 break;
5513 }
5514 if (i == 3) {
5515 alu.last = 1;
5516 }
5517 r = r600_bytecode_add_alu(ctx->bc, &alu);
5518 if (r)
5519 return r;
5520 }
5521 return 0;
5522 }
5523
5524 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
5525 unsigned index)
5526 {
5527 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5528 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
5529 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
5530 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
5531 ctx->src[index].neg || ctx->src[index].abs ||
5532 (inst->Src[index].Register.File == TGSI_FILE_INPUT && ctx->type == TGSI_PROCESSOR_GEOMETRY);
5533 }
5534
5535 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
5536 unsigned index)
5537 {
5538 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5539 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
5540 }
5541
5542 static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
5543 {
5544 struct r600_bytecode_vtx vtx;
5545 struct r600_bytecode_alu alu;
5546 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5547 int src_gpr, r, i;
5548 int id = tgsi_tex_get_src_gpr(ctx, 1);
5549
5550 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
5551 if (src_requires_loading) {
5552 for (i = 0; i < 4; i++) {
5553 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5554 alu.op = ALU_OP1_MOV;
5555 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5556 alu.dst.sel = ctx->temp_reg;
5557 alu.dst.chan = i;
5558 if (i == 3)
5559 alu.last = 1;
5560 alu.dst.write = 1;
5561 r = r600_bytecode_add_alu(ctx->bc, &alu);
5562 if (r)
5563 return r;
5564 }
5565 src_gpr = ctx->temp_reg;
5566 }
5567
5568 memset(&vtx, 0, sizeof(vtx));
5569 vtx.op = FETCH_OP_VFETCH;
5570 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
5571 vtx.fetch_type = SQ_VTX_FETCH_NO_INDEX_OFFSET;
5572 vtx.src_gpr = src_gpr;
5573 vtx.mega_fetch_count = 16;
5574 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
5575 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
5576 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
5577 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
5578 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
5579 vtx.use_const_fields = 1;
5580
5581 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
5582 return r;
5583
5584 if (ctx->bc->chip_class >= EVERGREEN)
5585 return 0;
5586
5587 for (i = 0; i < 4; i++) {
5588 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5589 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5590 continue;
5591
5592 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5593 alu.op = ALU_OP2_AND_INT;
5594
5595 alu.dst.chan = i;
5596 alu.dst.sel = vtx.dst_gpr;
5597 alu.dst.write = 1;
5598
5599 alu.src[0].sel = vtx.dst_gpr;
5600 alu.src[0].chan = i;
5601
5602 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL;
5603 alu.src[1].sel += (id * 2);
5604 alu.src[1].chan = i % 4;
5605 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5606
5607 if (i == lasti)
5608 alu.last = 1;
5609 r = r600_bytecode_add_alu(ctx->bc, &alu);
5610 if (r)
5611 return r;
5612 }
5613
5614 if (inst->Dst[0].Register.WriteMask & 3) {
5615 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5616 alu.op = ALU_OP2_OR_INT;
5617
5618 alu.dst.chan = 3;
5619 alu.dst.sel = vtx.dst_gpr;
5620 alu.dst.write = 1;
5621
5622 alu.src[0].sel = vtx.dst_gpr;
5623 alu.src[0].chan = 3;
5624
5625 alu.src[1].sel = R600_SHADER_BUFFER_INFO_SEL + (id * 2) + 1;
5626 alu.src[1].chan = 0;
5627 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5628
5629 alu.last = 1;
5630 r = r600_bytecode_add_alu(ctx->bc, &alu);
5631 if (r)
5632 return r;
5633 }
5634 return 0;
5635 }
5636
5637 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
5638 {
5639 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5640 struct r600_bytecode_alu alu;
5641 int r;
5642 int id = tgsi_tex_get_src_gpr(ctx, 1);
5643
5644 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5645 alu.op = ALU_OP1_MOV;
5646 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
5647 if (ctx->bc->chip_class >= EVERGREEN) {
5648 /* channel 0 or 2 of each word */
5649 alu.src[0].sel += (id / 2);
5650 alu.src[0].chan = (id % 2) * 2;
5651 } else {
5652 /* r600 we have them at channel 2 of the second dword */
5653 alu.src[0].sel += (id * 2) + 1;
5654 alu.src[0].chan = 1;
5655 }
5656 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5657 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
5658 alu.last = 1;
5659 r = r600_bytecode_add_alu(ctx->bc, &alu);
5660 if (r)
5661 return r;
5662 return 0;
5663 }
5664
5665 static int tgsi_tex(struct r600_shader_ctx *ctx)
5666 {
5667 static float one_point_five = 1.5f;
5668 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5669 struct r600_bytecode_tex tex;
5670 struct r600_bytecode_alu alu;
5671 unsigned src_gpr;
5672 int r, i, j;
5673 int opcode;
5674 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
5675 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5676 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
5677 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
5678
5679 bool txf_add_offsets = inst->Texture.NumOffsets &&
5680 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5681 inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
5682
5683 /* Texture fetch instructions can only use gprs as source.
5684 * Also they cannot negate the source or take the absolute value */
5685 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
5686 inst->Instruction.Opcode != TGSI_OPCODE_TXQS &&
5687 tgsi_tex_src_requires_loading(ctx, 0)) ||
5688 read_compressed_msaa || txf_add_offsets;
5689
5690 boolean src_loaded = FALSE;
5691 unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
5692 int8_t offset_x = 0, offset_y = 0, offset_z = 0;
5693 boolean has_txq_cube_array_z = false;
5694 unsigned sampler_index_mode;
5695
5696 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
5697 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5698 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
5699 if (inst->Dst[0].Register.WriteMask & 4) {
5700 ctx->shader->has_txq_cube_array_z_comp = true;
5701 has_txq_cube_array_z = true;
5702 }
5703
5704 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
5705 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5706 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
5707 inst->Instruction.Opcode == TGSI_OPCODE_TG4)
5708 sampler_src_reg = 2;
5709
5710 /* TGSI moves the sampler to src reg 3 for TXD */
5711 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD)
5712 sampler_src_reg = 3;
5713
5714 sampler_index_mode = inst->Src[sampler_src_reg].Indirect.Index == 2 ? 2 : 0; // CF_INDEX_1 : CF_INDEX_NONE
5715
5716 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
5717
5718 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
5719 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
5720 ctx->shader->uses_tex_buffers = true;
5721 return r600_do_buffer_txq(ctx);
5722 }
5723 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
5724 if (ctx->bc->chip_class < EVERGREEN)
5725 ctx->shader->uses_tex_buffers = true;
5726 return do_vtx_fetch_inst(ctx, src_requires_loading);
5727 }
5728 }
5729
5730 if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
5731 int out_chan;
5732 /* Add perspective divide */
5733 if (ctx->bc->chip_class == CAYMAN) {
5734 out_chan = 2;
5735 for (i = 0; i < 3; i++) {
5736 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5737 alu.op = ALU_OP1_RECIP_IEEE;
5738 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5739
5740 alu.dst.sel = ctx->temp_reg;
5741 alu.dst.chan = i;
5742 if (i == 2)
5743 alu.last = 1;
5744 if (out_chan == i)
5745 alu.dst.write = 1;
5746 r = r600_bytecode_add_alu(ctx->bc, &alu);
5747 if (r)
5748 return r;
5749 }
5750
5751 } else {
5752 out_chan = 3;
5753 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5754 alu.op = ALU_OP1_RECIP_IEEE;
5755 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5756
5757 alu.dst.sel = ctx->temp_reg;
5758 alu.dst.chan = out_chan;
5759 alu.last = 1;
5760 alu.dst.write = 1;
5761 r = r600_bytecode_add_alu(ctx->bc, &alu);
5762 if (r)
5763 return r;
5764 }
5765
5766 for (i = 0; i < 3; i++) {
5767 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5768 alu.op = ALU_OP2_MUL;
5769 alu.src[0].sel = ctx->temp_reg;
5770 alu.src[0].chan = out_chan;
5771 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5772 alu.dst.sel = ctx->temp_reg;
5773 alu.dst.chan = i;
5774 alu.dst.write = 1;
5775 r = r600_bytecode_add_alu(ctx->bc, &alu);
5776 if (r)
5777 return r;
5778 }
5779 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5780 alu.op = ALU_OP1_MOV;
5781 alu.src[0].sel = V_SQ_ALU_SRC_1;
5782 alu.src[0].chan = 0;
5783 alu.dst.sel = ctx->temp_reg;
5784 alu.dst.chan = 3;
5785 alu.last = 1;
5786 alu.dst.write = 1;
5787 r = r600_bytecode_add_alu(ctx->bc, &alu);
5788 if (r)
5789 return r;
5790 src_loaded = TRUE;
5791 src_gpr = ctx->temp_reg;
5792 }
5793
5794
5795 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
5796 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5797 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5798 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
5799 inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
5800 inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
5801
5802 static const unsigned src0_swizzle[] = {2, 2, 0, 1};
5803 static const unsigned src1_swizzle[] = {1, 0, 2, 2};
5804
5805 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
5806 for (i = 0; i < 4; i++) {
5807 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5808 alu.op = ALU_OP2_CUBE;
5809 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
5810 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
5811 alu.dst.sel = ctx->temp_reg;
5812 alu.dst.chan = i;
5813 if (i == 3)
5814 alu.last = 1;
5815 alu.dst.write = 1;
5816 r = r600_bytecode_add_alu(ctx->bc, &alu);
5817 if (r)
5818 return r;
5819 }
5820
5821 /* tmp1.z = RCP_e(|tmp1.z|) */
5822 if (ctx->bc->chip_class == CAYMAN) {
5823 for (i = 0; i < 3; i++) {
5824 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5825 alu.op = ALU_OP1_RECIP_IEEE;
5826 alu.src[0].sel = ctx->temp_reg;
5827 alu.src[0].chan = 2;
5828 alu.src[0].abs = 1;
5829 alu.dst.sel = ctx->temp_reg;
5830 alu.dst.chan = i;
5831 if (i == 2)
5832 alu.dst.write = 1;
5833 if (i == 2)
5834 alu.last = 1;
5835 r = r600_bytecode_add_alu(ctx->bc, &alu);
5836 if (r)
5837 return r;
5838 }
5839 } else {
5840 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5841 alu.op = ALU_OP1_RECIP_IEEE;
5842 alu.src[0].sel = ctx->temp_reg;
5843 alu.src[0].chan = 2;
5844 alu.src[0].abs = 1;
5845 alu.dst.sel = ctx->temp_reg;
5846 alu.dst.chan = 2;
5847 alu.dst.write = 1;
5848 alu.last = 1;
5849 r = r600_bytecode_add_alu(ctx->bc, &alu);
5850 if (r)
5851 return r;
5852 }
5853
5854 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
5855 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
5856 * muladd has no writemask, have to use another temp
5857 */
5858 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5859 alu.op = ALU_OP3_MULADD;
5860 alu.is_op3 = 1;
5861
5862 alu.src[0].sel = ctx->temp_reg;
5863 alu.src[0].chan = 0;
5864 alu.src[1].sel = ctx->temp_reg;
5865 alu.src[1].chan = 2;
5866
5867 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5868 alu.src[2].chan = 0;
5869 alu.src[2].value = *(uint32_t *)&one_point_five;
5870
5871 alu.dst.sel = ctx->temp_reg;
5872 alu.dst.chan = 0;
5873 alu.dst.write = 1;
5874
5875 r = r600_bytecode_add_alu(ctx->bc, &alu);
5876 if (r)
5877 return r;
5878
5879 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5880 alu.op = ALU_OP3_MULADD;
5881 alu.is_op3 = 1;
5882
5883 alu.src[0].sel = ctx->temp_reg;
5884 alu.src[0].chan = 1;
5885 alu.src[1].sel = ctx->temp_reg;
5886 alu.src[1].chan = 2;
5887
5888 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5889 alu.src[2].chan = 0;
5890 alu.src[2].value = *(uint32_t *)&one_point_five;
5891
5892 alu.dst.sel = ctx->temp_reg;
5893 alu.dst.chan = 1;
5894 alu.dst.write = 1;
5895
5896 alu.last = 1;
5897 r = r600_bytecode_add_alu(ctx->bc, &alu);
5898 if (r)
5899 return r;
5900 /* write initial compare value into Z component
5901 - W src 0 for shadow cube
5902 - X src 1 for shadow cube array */
5903 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5904 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5905 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5906 alu.op = ALU_OP1_MOV;
5907 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
5908 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5909 else
5910 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5911 alu.dst.sel = ctx->temp_reg;
5912 alu.dst.chan = 2;
5913 alu.dst.write = 1;
5914 alu.last = 1;
5915 r = r600_bytecode_add_alu(ctx->bc, &alu);
5916 if (r)
5917 return r;
5918 }
5919
5920 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5921 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5922 if (ctx->bc->chip_class >= EVERGREEN) {
5923 int mytmp = r600_get_temp(ctx);
5924 static const float eight = 8.0f;
5925 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5926 alu.op = ALU_OP1_MOV;
5927 alu.src[0].sel = ctx->temp_reg;
5928 alu.src[0].chan = 3;
5929 alu.dst.sel = mytmp;
5930 alu.dst.chan = 0;
5931 alu.dst.write = 1;
5932 alu.last = 1;
5933 r = r600_bytecode_add_alu(ctx->bc, &alu);
5934 if (r)
5935 return r;
5936
5937 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */
5938 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5939 alu.op = ALU_OP3_MULADD;
5940 alu.is_op3 = 1;
5941 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5942 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5943 alu.src[1].chan = 0;
5944 alu.src[1].value = *(uint32_t *)&eight;
5945 alu.src[2].sel = mytmp;
5946 alu.src[2].chan = 0;
5947 alu.dst.sel = ctx->temp_reg;
5948 alu.dst.chan = 3;
5949 alu.dst.write = 1;
5950 alu.last = 1;
5951 r = r600_bytecode_add_alu(ctx->bc, &alu);
5952 if (r)
5953 return r;
5954 } else if (ctx->bc->chip_class < EVERGREEN) {
5955 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5956 tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
5957 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5958 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5959 tex.src_gpr = r600_get_temp(ctx);
5960 tex.src_sel_x = 0;
5961 tex.src_sel_y = 0;
5962 tex.src_sel_z = 0;
5963 tex.src_sel_w = 0;
5964 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
5965 tex.coord_type_x = 1;
5966 tex.coord_type_y = 1;
5967 tex.coord_type_z = 1;
5968 tex.coord_type_w = 1;
5969 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5970 alu.op = ALU_OP1_MOV;
5971 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5972 alu.dst.sel = tex.src_gpr;
5973 alu.dst.chan = 0;
5974 alu.last = 1;
5975 alu.dst.write = 1;
5976 r = r600_bytecode_add_alu(ctx->bc, &alu);
5977 if (r)
5978 return r;
5979
5980 r = r600_bytecode_add_tex(ctx->bc, &tex);
5981 if (r)
5982 return r;
5983 }
5984
5985 }
5986
5987 /* for cube forms of lod and bias we need to route things */
5988 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
5989 inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
5990 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5991 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
5992 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5993 alu.op = ALU_OP1_MOV;
5994 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5995 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
5996 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5997 else
5998 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5999 alu.dst.sel = ctx->temp_reg;
6000 alu.dst.chan = 2;
6001 alu.last = 1;
6002 alu.dst.write = 1;
6003 r = r600_bytecode_add_alu(ctx->bc, &alu);
6004 if (r)
6005 return r;
6006 }
6007
6008 src_loaded = TRUE;
6009 src_gpr = ctx->temp_reg;
6010 }
6011
6012 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
6013 int temp_h = 0, temp_v = 0;
6014 int start_val = 0;
6015
6016 /* if we've already loaded the src (i.e. CUBE don't reload it). */
6017 if (src_loaded == TRUE)
6018 start_val = 1;
6019 else
6020 src_loaded = TRUE;
6021 for (i = start_val; i < 3; i++) {
6022 int treg = r600_get_temp(ctx);
6023
6024 if (i == 0)
6025 src_gpr = treg;
6026 else if (i == 1)
6027 temp_h = treg;
6028 else
6029 temp_v = treg;
6030
6031 for (j = 0; j < 4; j++) {
6032 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6033 alu.op = ALU_OP1_MOV;
6034 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
6035 alu.dst.sel = treg;
6036 alu.dst.chan = j;
6037 if (j == 3)
6038 alu.last = 1;
6039 alu.dst.write = 1;
6040 r = r600_bytecode_add_alu(ctx->bc, &alu);
6041 if (r)
6042 return r;
6043 }
6044 }
6045 for (i = 1; i < 3; i++) {
6046 /* set gradients h/v */
6047 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6048 tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
6049 FETCH_OP_SET_GRADIENTS_V;
6050 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6051 tex.sampler_index_mode = sampler_index_mode;
6052 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6053 tex.resource_index_mode = sampler_index_mode;
6054
6055 tex.src_gpr = (i == 1) ? temp_h : temp_v;
6056 tex.src_sel_x = 0;
6057 tex.src_sel_y = 1;
6058 tex.src_sel_z = 2;
6059 tex.src_sel_w = 3;
6060
6061 tex.dst_gpr = r600_get_temp(ctx); /* just to avoid confusing the asm scheduler */
6062 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
6063 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
6064 tex.coord_type_x = 1;
6065 tex.coord_type_y = 1;
6066 tex.coord_type_z = 1;
6067 tex.coord_type_w = 1;
6068 }
6069 r = r600_bytecode_add_tex(ctx->bc, &tex);
6070 if (r)
6071 return r;
6072 }
6073 }
6074
6075 if (src_requires_loading && !src_loaded) {
6076 for (i = 0; i < 4; i++) {
6077 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6078 alu.op = ALU_OP1_MOV;
6079 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6080 alu.dst.sel = ctx->temp_reg;
6081 alu.dst.chan = i;
6082 if (i == 3)
6083 alu.last = 1;
6084 alu.dst.write = 1;
6085 r = r600_bytecode_add_alu(ctx->bc, &alu);
6086 if (r)
6087 return r;
6088 }
6089 src_loaded = TRUE;
6090 src_gpr = ctx->temp_reg;
6091 }
6092
6093 /* get offset values */
6094 if (inst->Texture.NumOffsets) {
6095 assert(inst->Texture.NumOffsets == 1);
6096
6097 /* The texture offset feature doesn't work with the TXF instruction
6098 * and must be emulated by adding the offset to the texture coordinates. */
6099 if (txf_add_offsets) {
6100 const struct tgsi_texture_offset *off = inst->TexOffsets;
6101
6102 switch (inst->Texture.Texture) {
6103 case TGSI_TEXTURE_3D:
6104 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6105 alu.op = ALU_OP2_ADD_INT;
6106 alu.src[0].sel = src_gpr;
6107 alu.src[0].chan = 2;
6108 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6109 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
6110 alu.dst.sel = src_gpr;
6111 alu.dst.chan = 2;
6112 alu.dst.write = 1;
6113 alu.last = 1;
6114 r = r600_bytecode_add_alu(ctx->bc, &alu);
6115 if (r)
6116 return r;
6117 /* fall through */
6118
6119 case TGSI_TEXTURE_2D:
6120 case TGSI_TEXTURE_SHADOW2D:
6121 case TGSI_TEXTURE_RECT:
6122 case TGSI_TEXTURE_SHADOWRECT:
6123 case TGSI_TEXTURE_2D_ARRAY:
6124 case TGSI_TEXTURE_SHADOW2D_ARRAY:
6125 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6126 alu.op = ALU_OP2_ADD_INT;
6127 alu.src[0].sel = src_gpr;
6128 alu.src[0].chan = 1;
6129 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6130 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
6131 alu.dst.sel = src_gpr;
6132 alu.dst.chan = 1;
6133 alu.dst.write = 1;
6134 alu.last = 1;
6135 r = r600_bytecode_add_alu(ctx->bc, &alu);
6136 if (r)
6137 return r;
6138 /* fall through */
6139
6140 case TGSI_TEXTURE_1D:
6141 case TGSI_TEXTURE_SHADOW1D:
6142 case TGSI_TEXTURE_1D_ARRAY:
6143 case TGSI_TEXTURE_SHADOW1D_ARRAY:
6144 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6145 alu.op = ALU_OP2_ADD_INT;
6146 alu.src[0].sel = src_gpr;
6147 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6148 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
6149 alu.dst.sel = src_gpr;
6150 alu.dst.write = 1;
6151 alu.last = 1;
6152 r = r600_bytecode_add_alu(ctx->bc, &alu);
6153 if (r)
6154 return r;
6155 break;
6156 /* texture offsets do not apply to other texture targets */
6157 }
6158 } else {
6159 switch (inst->Texture.Texture) {
6160 case TGSI_TEXTURE_3D:
6161 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
6162 /* fallthrough */
6163 case TGSI_TEXTURE_2D:
6164 case TGSI_TEXTURE_SHADOW2D:
6165 case TGSI_TEXTURE_RECT:
6166 case TGSI_TEXTURE_SHADOWRECT:
6167 case TGSI_TEXTURE_2D_ARRAY:
6168 case TGSI_TEXTURE_SHADOW2D_ARRAY:
6169 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
6170 /* fallthrough */
6171 case TGSI_TEXTURE_1D:
6172 case TGSI_TEXTURE_SHADOW1D:
6173 case TGSI_TEXTURE_1D_ARRAY:
6174 case TGSI_TEXTURE_SHADOW1D_ARRAY:
6175 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
6176 }
6177 }
6178 }
6179
6180 /* Obtain the sample index for reading a compressed MSAA color texture.
6181 * To read the FMASK, we use the ldfptr instruction, which tells us
6182 * where the samples are stored.
6183 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
6184 * which is the identity mapping. Each nibble says which physical sample
6185 * should be fetched to get that sample.
6186 *
6187 * Assume src.z contains the sample index. It should be modified like this:
6188 * src.z = (ldfptr() >> (src.z * 4)) & 0xF;
6189 * Then fetch the texel with src.
6190 */
6191 if (read_compressed_msaa) {
6192 unsigned sample_chan = 3;
6193 unsigned temp = r600_get_temp(ctx);
6194 assert(src_loaded);
6195
6196 /* temp.w = ldfptr() */
6197 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6198 tex.op = FETCH_OP_LD;
6199 tex.inst_mod = 1; /* to indicate this is ldfptr */
6200 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6201 tex.sampler_index_mode = sampler_index_mode;
6202 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6203 tex.resource_index_mode = sampler_index_mode;
6204 tex.src_gpr = src_gpr;
6205 tex.dst_gpr = temp;
6206 tex.dst_sel_x = 7; /* mask out these components */
6207 tex.dst_sel_y = 7;
6208 tex.dst_sel_z = 7;
6209 tex.dst_sel_w = 0; /* store X */
6210 tex.src_sel_x = 0;
6211 tex.src_sel_y = 1;
6212 tex.src_sel_z = 2;
6213 tex.src_sel_w = 3;
6214 tex.offset_x = offset_x;
6215 tex.offset_y = offset_y;
6216 tex.offset_z = offset_z;
6217 r = r600_bytecode_add_tex(ctx->bc, &tex);
6218 if (r)
6219 return r;
6220
6221 /* temp.x = sample_index*4 */
6222 if (ctx->bc->chip_class == CAYMAN) {
6223 for (i = 0 ; i < 4; i++) {
6224 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6225 alu.op = ALU_OP2_MULLO_INT;
6226 alu.src[0].sel = src_gpr;
6227 alu.src[0].chan = sample_chan;
6228 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6229 alu.src[1].value = 4;
6230 alu.dst.sel = temp;
6231 alu.dst.chan = i;
6232 alu.dst.write = i == 0;
6233 if (i == 3)
6234 alu.last = 1;
6235 r = r600_bytecode_add_alu(ctx->bc, &alu);
6236 if (r)
6237 return r;
6238 }
6239 } else {
6240 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6241 alu.op = ALU_OP2_MULLO_INT;
6242 alu.src[0].sel = src_gpr;
6243 alu.src[0].chan = sample_chan;
6244 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6245 alu.src[1].value = 4;
6246 alu.dst.sel = temp;
6247 alu.dst.chan = 0;
6248 alu.dst.write = 1;
6249 alu.last = 1;
6250 r = r600_bytecode_add_alu(ctx->bc, &alu);
6251 if (r)
6252 return r;
6253 }
6254
6255 /* sample_index = temp.w >> temp.x */
6256 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6257 alu.op = ALU_OP2_LSHR_INT;
6258 alu.src[0].sel = temp;
6259 alu.src[0].chan = 3;
6260 alu.src[1].sel = temp;
6261 alu.src[1].chan = 0;
6262 alu.dst.sel = src_gpr;
6263 alu.dst.chan = sample_chan;
6264 alu.dst.write = 1;
6265 alu.last = 1;
6266 r = r600_bytecode_add_alu(ctx->bc, &alu);
6267 if (r)
6268 return r;
6269
6270 /* sample_index & 0xF */
6271 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6272 alu.op = ALU_OP2_AND_INT;
6273 alu.src[0].sel = src_gpr;
6274 alu.src[0].chan = sample_chan;
6275 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
6276 alu.src[1].value = 0xF;
6277 alu.dst.sel = src_gpr;
6278 alu.dst.chan = sample_chan;
6279 alu.dst.write = 1;
6280 alu.last = 1;
6281 r = r600_bytecode_add_alu(ctx->bc, &alu);
6282 if (r)
6283 return r;
6284 #if 0
6285 /* visualize the FMASK */
6286 for (i = 0; i < 4; i++) {
6287 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6288 alu.op = ALU_OP1_INT_TO_FLT;
6289 alu.src[0].sel = src_gpr;
6290 alu.src[0].chan = sample_chan;
6291 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6292 alu.dst.chan = i;
6293 alu.dst.write = 1;
6294 alu.last = 1;
6295 r = r600_bytecode_add_alu(ctx->bc, &alu);
6296 if (r)
6297 return r;
6298 }
6299 return 0;
6300 #endif
6301 }
6302
6303 /* does this shader want a num layers from TXQ for a cube array? */
6304 if (has_txq_cube_array_z) {
6305 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6306
6307 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6308 alu.op = ALU_OP1_MOV;
6309
6310 alu.src[0].sel = R600_SHADER_BUFFER_INFO_SEL;
6311 if (ctx->bc->chip_class >= EVERGREEN) {
6312 /* channel 1 or 3 of each word */
6313 alu.src[0].sel += (id / 2);
6314 alu.src[0].chan = ((id % 2) * 2) + 1;
6315 } else {
6316 /* r600 we have them at channel 2 of the second dword */
6317 alu.src[0].sel += (id * 2) + 1;
6318 alu.src[0].chan = 2;
6319 }
6320 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
6321 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
6322 alu.last = 1;
6323 r = r600_bytecode_add_alu(ctx->bc, &alu);
6324 if (r)
6325 return r;
6326 /* disable writemask from texture instruction */
6327 inst->Dst[0].Register.WriteMask &= ~4;
6328 }
6329
6330 opcode = ctx->inst_info->op;
6331 if (opcode == FETCH_OP_GATHER4 &&
6332 inst->TexOffsets[0].File != TGSI_FILE_NULL &&
6333 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
6334 opcode = FETCH_OP_GATHER4_O;
6335
6336 /* GATHER4_O/GATHER4_C_O use offset values loaded by
6337 SET_TEXTURE_OFFSETS instruction. The immediate offset values
6338 encoded in the instruction are ignored. */
6339 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6340 tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
6341 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6342 tex.sampler_index_mode = sampler_index_mode;
6343 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6344 tex.resource_index_mode = sampler_index_mode;
6345
6346 tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
6347 tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
6348 tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
6349 tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
6350 tex.src_sel_w = 4;
6351
6352 tex.dst_sel_x = 7;
6353 tex.dst_sel_y = 7;
6354 tex.dst_sel_z = 7;
6355 tex.dst_sel_w = 7;
6356
6357 r = r600_bytecode_add_tex(ctx->bc, &tex);
6358 if (r)
6359 return r;
6360 }
6361
6362 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
6363 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
6364 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
6365 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6366 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
6367 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
6368 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6369 switch (opcode) {
6370 case FETCH_OP_SAMPLE:
6371 opcode = FETCH_OP_SAMPLE_C;
6372 break;
6373 case FETCH_OP_SAMPLE_L:
6374 opcode = FETCH_OP_SAMPLE_C_L;
6375 break;
6376 case FETCH_OP_SAMPLE_LB:
6377 opcode = FETCH_OP_SAMPLE_C_LB;
6378 break;
6379 case FETCH_OP_SAMPLE_G:
6380 opcode = FETCH_OP_SAMPLE_C_G;
6381 break;
6382 /* Texture gather variants */
6383 case FETCH_OP_GATHER4:
6384 opcode = FETCH_OP_GATHER4_C;
6385 break;
6386 case FETCH_OP_GATHER4_O:
6387 opcode = FETCH_OP_GATHER4_C_O;
6388 break;
6389 }
6390 }
6391
6392 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
6393 tex.op = opcode;
6394
6395 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
6396 tex.sampler_index_mode = sampler_index_mode;
6397 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
6398 tex.resource_index_mode = sampler_index_mode;
6399 tex.src_gpr = src_gpr;
6400 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
6401
6402 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
6403 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
6404 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
6405 }
6406
6407 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
6408 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
6409 tex.inst_mod = texture_component_select;
6410
6411 if (ctx->bc->chip_class == CAYMAN) {
6412 /* GATHER4 result order is different from TGSI TG4 */
6413 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 0 : 7;
6414 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 1 : 7;
6415 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 2 : 7;
6416 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6417 } else {
6418 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6419 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
6420 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6421 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6422 }
6423 }
6424 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
6425 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6426 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6427 tex.dst_sel_z = 7;
6428 tex.dst_sel_w = 7;
6429 }
6430 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
6431 tex.dst_sel_x = 3;
6432 tex.dst_sel_y = 7;
6433 tex.dst_sel_z = 7;
6434 tex.dst_sel_w = 7;
6435 }
6436 else {
6437 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
6438 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
6439 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
6440 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
6441 }
6442
6443
6444 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ||
6445 inst->Instruction.Opcode == TGSI_OPCODE_TXQS) {
6446 tex.src_sel_x = 4;
6447 tex.src_sel_y = 4;
6448 tex.src_sel_z = 4;
6449 tex.src_sel_w = 4;
6450 } else if (src_loaded) {
6451 tex.src_sel_x = 0;
6452 tex.src_sel_y = 1;
6453 tex.src_sel_z = 2;
6454 tex.src_sel_w = 3;
6455 } else {
6456 tex.src_sel_x = ctx->src[0].swizzle[0];
6457 tex.src_sel_y = ctx->src[0].swizzle[1];
6458 tex.src_sel_z = ctx->src[0].swizzle[2];
6459 tex.src_sel_w = ctx->src[0].swizzle[3];
6460 tex.src_rel = ctx->src[0].rel;
6461 }
6462
6463 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
6464 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
6465 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6466 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
6467 tex.src_sel_x = 1;
6468 tex.src_sel_y = 0;
6469 tex.src_sel_z = 3;
6470 tex.src_sel_w = 2; /* route Z compare or Lod value into W */
6471 }
6472
6473 if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
6474 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
6475 tex.coord_type_x = 1;
6476 tex.coord_type_y = 1;
6477 }
6478 tex.coord_type_z = 1;
6479 tex.coord_type_w = 1;
6480
6481 tex.offset_x = offset_x;
6482 tex.offset_y = offset_y;
6483 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
6484 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
6485 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
6486 tex.offset_z = 0;
6487 }
6488 else {
6489 tex.offset_z = offset_z;
6490 }
6491
6492 /* Put the depth for comparison in W.
6493 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
6494 * Some instructions expect the depth in Z. */
6495 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
6496 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
6497 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
6498 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
6499 opcode != FETCH_OP_SAMPLE_C_L &&
6500 opcode != FETCH_OP_SAMPLE_C_LB) {
6501 tex.src_sel_w = tex.src_sel_z;
6502 }
6503
6504 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
6505 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
6506 if (opcode == FETCH_OP_SAMPLE_C_L ||
6507 opcode == FETCH_OP_SAMPLE_C_LB) {
6508 /* the array index is read from Y */
6509 tex.coord_type_y = 0;
6510 } else {
6511 /* the array index is read from Z */
6512 tex.coord_type_z = 0;
6513 tex.src_sel_z = tex.src_sel_y;
6514 }
6515 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
6516 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
6517 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
6518 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
6519 (ctx->bc->chip_class >= EVERGREEN)))
6520 /* the array index is read from Z */
6521 tex.coord_type_z = 0;
6522
6523 /* mask unused source components */
6524 if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
6525 switch (inst->Texture.Texture) {
6526 case TGSI_TEXTURE_2D:
6527 case TGSI_TEXTURE_RECT:
6528 tex.src_sel_z = 7;
6529 tex.src_sel_w = 7;
6530 break;
6531 case TGSI_TEXTURE_1D_ARRAY:
6532 tex.src_sel_y = 7;
6533 tex.src_sel_w = 7;
6534 break;
6535 case TGSI_TEXTURE_1D:
6536 tex.src_sel_y = 7;
6537 tex.src_sel_z = 7;
6538 tex.src_sel_w = 7;
6539 break;
6540 }
6541 }
6542
6543 r = r600_bytecode_add_tex(ctx->bc, &tex);
6544 if (r)
6545 return r;
6546
6547 /* add shadow ambient support - gallium doesn't do it yet */
6548 return 0;
6549 }
6550
/* Emit ALU code for TGSI LRP: dst = src0 * src1 + (1 - src0) * src2,
 * computed per enabled write-mask channel via three ALU passes
 * (1-src0, then (1-src0)*src2, then MULADD). */
static int tgsi_lrp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned i, temp_regs[2];
	int r;

	/* optimize if it's just an equal balance */
	if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
		for (i = 0; i < lasti + 1; i++) {
			if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
				continue;

			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP2_ADD;
			r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
			r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
			/* output modifier 3 = divide by 2: (src1 + src2) * 0.5 */
			alu.omod = 3;
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
			alu.dst.chan = i;
			if (i == lasti) {
				alu.last = 1;
			}
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
		return 0;
	}

	/* 1 - src0 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_ADD;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;
		r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
		/* compute 1 + (-src0) */
		r600_bytecode_src_toggle_neg(&alu.src[1]);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* (1 - src0) * src2 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;
		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* src0 * src1 + (1 - src0) * src2 */
	/* sources with |abs| need a scratch register for op3 operand setup */
	if (ctx->src[0].abs)
		temp_regs[0] = r600_get_temp(ctx);
	else
		temp_regs[0] = 0;
	if (ctx->src[1].abs)
		temp_regs[1] = r600_get_temp(ctx);
	else
		temp_regs[1] = 0;

	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;
		alu.is_op3 = 1;
		r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
		if (r)
			return r;
		r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[1], &ctx->src[1]);
		if (r)
			return r;
		/* accumulate with the (1 - src0) * src2 partial result */
		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].chan = i;

		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
6662
6663 static int tgsi_cmp(struct r600_shader_ctx *ctx)
6664 {
6665 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6666 struct r600_bytecode_alu alu;
6667 int i, r, j;
6668 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6669 int temp_regs[3];
6670
6671 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
6672 temp_regs[j] = 0;
6673 if (ctx->src[j].abs)
6674 temp_regs[j] = r600_get_temp(ctx);
6675 }
6676
6677 for (i = 0; i < lasti + 1; i++) {
6678 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6679 continue;
6680
6681 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6682 alu.op = ALU_OP3_CNDGE;
6683 r = tgsi_make_src_for_op3(ctx, temp_regs[0], i, &alu.src[0], &ctx->src[0]);
6684 if (r)
6685 return r;
6686 r = tgsi_make_src_for_op3(ctx, temp_regs[2], i, &alu.src[1], &ctx->src[2]);
6687 if (r)
6688 return r;
6689 r = tgsi_make_src_for_op3(ctx, temp_regs[1], i, &alu.src[2], &ctx->src[1]);
6690 if (r)
6691 return r;
6692 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6693 alu.dst.chan = i;
6694 alu.dst.write = 1;
6695 alu.is_op3 = 1;
6696 if (i == lasti)
6697 alu.last = 1;
6698 r = r600_bytecode_add_alu(ctx->bc, &alu);
6699 if (r)
6700 return r;
6701 }
6702 return 0;
6703 }
6704
6705 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
6706 {
6707 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6708 struct r600_bytecode_alu alu;
6709 int i, r;
6710 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6711
6712 for (i = 0; i < lasti + 1; i++) {
6713 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6714 continue;
6715
6716 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6717 alu.op = ALU_OP3_CNDE_INT;
6718 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6719 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
6720 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
6721 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6722 alu.dst.chan = i;
6723 alu.dst.write = 1;
6724 alu.is_op3 = 1;
6725 if (i == lasti)
6726 alu.last = 1;
6727 r = r600_bytecode_add_alu(ctx->bc, &alu);
6728 if (r)
6729 return r;
6730 }
6731 return 0;
6732 }
6733
/* Emit TGSI XPD (cross product): dst.xyz = src0 x src1 computed as
 * src0.zxy * src1.yzx - src0.yzx * src1.zxy; the W channel is built
 * from 0 * 0 operands. Two passes: a MUL into the temp register, then
 * a MULADD subtracting the temp from the swapped-swizzle product. */
static int tgsi_xpd(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	/* Operand swizzles for the two partial products. */
	static const unsigned int src0_swizzle[] = {2, 0, 1};
	static const unsigned int src1_swizzle[] = {1, 2, 0};
	struct r600_bytecode_alu alu;
	uint32_t use_temp = 0;
	int i, r;

	/* With a partial write mask, build the full result in the temp
	 * register first and copy out only the requested channels. */
	if (inst->Dst[0].Register.WriteMask != 0xf)
		use_temp = 1;

	/* Pass 1: temp.chan = src0.<swz0> * src1.<swz1> (0 * 0 for W). */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP2_MUL;
		if (i < 3) {
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
		} else {
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = i;
		alu.dst.write = 1;

		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* Pass 2: dst.chan = src0.<swz1> * src1.<swz0> - temp.chan
	 * (MULADD with the temp operand negated). */
	for (i = 0; i < 4; i++) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		alu.op = ALU_OP3_MULADD;

		if (i < 3) {
			/* note the swizzle tables are swapped vs pass 1 */
			r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
			r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
		} else {
			alu.src[0].sel = V_SQ_ALU_SRC_0;
			alu.src[0].chan = i;
			alu.src[1].sel = V_SQ_ALU_SRC_0;
			alu.src[1].chan = i;
		}

		alu.src[2].sel = ctx->temp_reg;
		alu.src[2].neg = 1;
		alu.src[2].chan = i;

		if (use_temp)
			alu.dst.sel = ctx->temp_reg;
		else
			tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		alu.dst.chan = i;
		alu.dst.write = 1;
		alu.is_op3 = 1;
		if (i == 3)
			alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	/* copy masked channels from the temp register to the real dst */
	if (use_temp)
		return tgsi_helper_copy(ctx, inst);
	return 0;
}
6805
/* Emit TGSI EXP: result = (2^floor(src), frac(src), 2^src, 1.0),
 * built per write-mask channel into the temp register and copied to
 * the destination at the end. On Cayman the transcendental EXP_IEEE
 * is replicated over three vector slots instead of the t-slot. */
static int tgsi_exp(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i;

	/* result.x = 2^floor(src); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* NOTE(review): the FLOOR's alu struct is reused below without
		 * a fresh memset; only op/src/dst fields are rewritten, so the
		 * remaining fields carry over from the FLOOR setup. */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 0;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				alu.dst.write = i == 0;
				alu.last = i == 2;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 0;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.y = tmp - floor(tmp); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FRACT;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

		alu.dst.sel = ctx->temp_reg;
#if 0
		r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
		if (r)
			return r;
#endif
		alu.dst.write = 1;
		alu.dst.chan = 1;

		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = RoughApprox2ToX(tmp);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				/* only the last replicated slot writes/ends the group */
				if (i == 2) {
					alu.dst.write = 1;
					alu.last = 1;
				}

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;

			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0;*/
	if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	/* copy the masked channels from the temp register to the real dst */
	return tgsi_helper_copy(ctx, inst);
}
6934
/* Emit TGSI LOG: result = (floor(log2|src|), |src| / 2^floor(log2|src|),
 * log2|src|, 1.0), built per write-mask channel into the temp register
 * and copied to the destination at the end. On Cayman the transcendental
 * ops (LOG/EXP/RECIP_IEEE) are replicated over three vector slots. */
static int tgsi_log(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i;

	/* result.x = floor(log2(|src|)); */
	if (inst->Dst[0].Register.WriteMask & 1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 0)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}

		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 0;
			alu.dst.write = 1;
			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* NOTE(review): the LOG's alu struct is reused here without a
		 * fresh memset; only op/src/dst fields are rewritten. */
		alu.op = ALU_OP1_FLOOR;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 0;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
	if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {

		/* step 1: temp.y = log2(|src|) */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* step 2: temp.y = floor(temp.y) */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_FLOOR;
		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;

		/* step 3: temp.y = 2^temp.y */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_EXP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 1;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_EXP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* step 4: temp.y = 1 / temp.y */
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));
				alu.op = ALU_OP1_RECIP_IEEE;
				alu.src[0].sel = ctx->temp_reg;
				alu.src[0].chan = 1;

				alu.dst.sel = ctx->temp_reg;
				alu.dst.chan = i;
				if (i == 1)
					alu.dst.write = 1;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));
			alu.op = ALU_OP1_RECIP_IEEE;
			alu.src[0].sel = ctx->temp_reg;
			alu.src[0].chan = 1;

			alu.dst.sel = ctx->temp_reg;
			alu.dst.chan = 1;
			alu.dst.write = 1;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}

		/* step 5: temp.y = |src.x| * temp.y */
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP2_MUL;

		r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
		r600_bytecode_src_set_abs(&alu.src[0]);

		alu.src[1].sel = ctx->temp_reg;
		alu.src[1].chan = 1;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 1;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* result.z = log2(|src|);*/
	if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
		if (ctx->bc->chip_class == CAYMAN) {
			for (i = 0; i < 3; i++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP1_LOG_IEEE;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
				r600_bytecode_src_set_abs(&alu.src[0]);

				alu.dst.sel = ctx->temp_reg;
				if (i == 2)
					alu.dst.write = 1;
				alu.dst.chan = i;
				if (i == 2)
					alu.last = 1;

				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.op = ALU_OP1_LOG_IEEE;
			r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
			r600_bytecode_src_set_abs(&alu.src[0]);

			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;
			alu.dst.chan = 2;
			alu.last = 1;

			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}

	/* result.w = 1.0; */
	if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
		memset(&alu, 0, sizeof(struct r600_bytecode_alu));

		alu.op = ALU_OP1_MOV;
		alu.src[0].sel = V_SQ_ALU_SRC_1;
		alu.src[0].chan = 0;

		alu.dst.sel = ctx->temp_reg;
		alu.dst.chan = 3;
		alu.dst.write = 1;
		alu.last = 1;

		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* copy the masked channels from the temp register to the real dst */
	return tgsi_helper_copy(ctx, inst);
}
7194
/* Evergreen+ address-register load (ARL/ARR/UARL): convert or move the
 * source into the AR/index register selected by the destination index,
 * then mark that register as needing a reload before its next use. */
static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
	unsigned reg = get_address_file_reg(ctx, inst->Dst[0].Register.Index);

	/* index 0 is AR, 1-2 are the index registers */
	assert(inst->Dst[0].Register.Index < 3);
	memset(&alu, 0, sizeof(struct r600_bytecode_alu));

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* round toward negative infinity */
		alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
		break;
	case TGSI_OPCODE_ARR:
		/* round to nearest */
		alu.op = ALU_OP1_FLT_TO_INT;
		break;
	case TGSI_OPCODE_UARL:
		/* source is already an integer */
		alu.op = ALU_OP1_MOV;
		break;
	default:
		assert(0);
		return -1;
	}

	/* the alu struct is deliberately reused across iterations; only
	 * src/dst/last change per channel */
	for (i = 0; i <= lasti; ++i) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;
		r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
		alu.last = i == lasti;
		alu.dst.sel = reg;
		alu.dst.chan = i;
		alu.dst.write = 1;
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}

	/* invalidate the cached copy so it is reloaded before use */
	if (inst->Dst[0].Register.Index > 0)
		ctx->bc->index_loaded[inst->Dst[0].Register.Index - 1] = 0;
	else
		ctx->bc->ar_loaded = 0;

	return 0;
}
/* r600/r700 address-register load (ARL/ARR/UARL): compute the integer
 * index into the dedicated ar_reg temp and invalidate the cached AR
 * value. Pre-Evergreen lacks FLT_TO_INT_FLOOR, so ARL is FLOOR followed
 * by FLT_TO_INT. */
static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int r;
	int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_ARL:
		/* pass 1: ar_reg = floor(src) */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLOOR;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}

		/* pass 2: ar_reg = int(ar_reg), converting all channels */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.src[0].sel = ctx->bc->ar_reg;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			alu.dst.chan = i;
			alu.src[0].chan = i;
			if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
				return r;
		}
		break;
	case TGSI_OPCODE_ARR:
		/* single round-to-nearest conversion */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_FLT_TO_INT;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		/* FLT_TO_INT is trans-only on r600/r700 */
		alu.last = TRUE;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	case TGSI_OPCODE_UARL:
		/* source is already an integer; just move it */
		memset(&alu, 0, sizeof(alu));
		alu.op = ALU_OP1_MOV;
		alu.dst.sel = ctx->bc->ar_reg;
		alu.dst.write = 1;
		for (i = 0; i <= lasti; ++i) {
			if (inst->Dst[0].Register.WriteMask & (1 << i)) {
				alu.dst.chan = i;
				r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
				alu.last = i == lasti;
				if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
					return r;
			}
		}
		break;
	default:
		assert(0);
		return -1;
	}

	/* force MOVA before the next AR use */
	ctx->bc->ar_loaded = 0;
	return 0;
}
7317
7318 static int tgsi_opdst(struct r600_shader_ctx *ctx)
7319 {
7320 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7321 struct r600_bytecode_alu alu;
7322 int i, r = 0;
7323
7324 for (i = 0; i < 4; i++) {
7325 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7326
7327 alu.op = ALU_OP2_MUL;
7328 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7329
7330 if (i == 0 || i == 3) {
7331 alu.src[0].sel = V_SQ_ALU_SRC_1;
7332 } else {
7333 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
7334 }
7335
7336 if (i == 0 || i == 2) {
7337 alu.src[1].sel = V_SQ_ALU_SRC_1;
7338 } else {
7339 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
7340 }
7341 if (i == 3)
7342 alu.last = 1;
7343 r = r600_bytecode_add_alu(ctx->bc, &alu);
7344 if (r)
7345 return r;
7346 }
7347 return 0;
7348 }
7349
7350 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
7351 {
7352 struct r600_bytecode_alu alu;
7353 int r;
7354
7355 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7356 alu.op = opcode;
7357 alu.execute_mask = 1;
7358 alu.update_pred = 1;
7359
7360 alu.dst.sel = ctx->temp_reg;
7361 alu.dst.write = 1;
7362 alu.dst.chan = 0;
7363
7364 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
7365 alu.src[1].sel = V_SQ_ALU_SRC_0;
7366 alu.src[1].chan = 0;
7367
7368 alu.last = 1;
7369
7370 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
7371 if (r)
7372 return r;
7373 return 0;
7374 }
7375
/* Emit 'pops' stack pops. When possible, fold them into the preceding
 * ALU clause by retagging it ALU_POP_AFTER (1 pop) or ALU_POP2_AFTER
 * (2 pops); otherwise emit an explicit POP CF instruction. */
static int pops(struct r600_shader_ctx *ctx, int pops)
{
	unsigned force_pop = ctx->bc->force_add_cf;

	if (!force_pop) {
		/* alu_pop counts pops already carried by the last CF op;
		 * 3 is a sentinel meaning "cannot fold" */
		int alu_pop = 3;
		if (ctx->bc->cf_last) {
			if (ctx->bc->cf_last->op == CF_OP_ALU)
				alu_pop = 0;
			else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
				alu_pop = 1;
		}
		alu_pop += pops;
		if (alu_pop == 1) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
			/* the clause now pops; later ALU must start a new CF */
			ctx->bc->force_add_cf = 1;
		} else if (alu_pop == 2) {
			ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
			ctx->bc->force_add_cf = 1;
		} else {
			force_pop = 1;
		}
	}

	if (force_pop) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
		ctx->bc->cf_last->pop_count = pops;
		/* POP falls through to the next instruction */
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
	}

	return 0;
}
7408
/* Recompute the worst-case hardware branch-stack depth (in entries)
 * from the current push/push_wqm/loop counters, applying per-chip
 * reservation quirks, and record it in stack->max_entries. Called on
 * every callstack_push. */
static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
		unsigned reason)
{
	struct r600_stack_info *stack = &ctx->bc->stack;
	unsigned elements, entries;

	unsigned entry_size = stack->entry_size;

	/* loop and WQM frames consume a full entry; plain pushes one element */
	elements = (stack->loop + stack->push_wqm ) * entry_size;
	elements += stack->push;

	switch (ctx->bc->chip_class) {
	case R600:
	case R700:
		/* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
		 * the stack must be reserved to hold the current active/continue
		 * masks */
		if (reason == FC_PUSH_VPM) {
			elements += 2;
		}
		break;

	case CAYMAN:
		/* r9xx: any stack operation on empty stack consumes 2 additional
		 * elements */
		elements += 2;

		/* fallthrough */
		/* FIXME: do the two elements added above cover the cases for the
		 * r8xx+ below? */

	case EVERGREEN:
		/* r8xx+: 2 extra elements are not always required, but one extra
		 * element must be added for each of the following cases:
		 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
		 *    stack usage.
		 *    (Currently we don't use ALU_ELSE_AFTER.)
		 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
		 *    PUSH instruction executed.
		 *
		 *    NOTE: it seems we also need to reserve additional element in some
		 *    other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
		 *    then STACK_SIZE should be 2 instead of 1 */
		if (reason == FC_PUSH_VPM) {
			elements += 1;
		}
		break;

	default:
		assert(0);
		break;
	}

	/* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
	 * for all chips, so we use 4 in the final formula, not the real entry_size
	 * for the chip */
	entry_size = 4;

	/* round elements up to whole hardware entries */
	entries = (elements + (entry_size - 1)) / entry_size;

	if (entries > stack->max_entries)
		stack->max_entries = entries;
}
7472
7473 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
7474 {
7475 switch(reason) {
7476 case FC_PUSH_VPM:
7477 --ctx->bc->stack.push;
7478 assert(ctx->bc->stack.push >= 0);
7479 break;
7480 case FC_PUSH_WQM:
7481 --ctx->bc->stack.push_wqm;
7482 assert(ctx->bc->stack.push_wqm >= 0);
7483 break;
7484 case FC_LOOP:
7485 --ctx->bc->stack.loop;
7486 assert(ctx->bc->stack.loop >= 0);
7487 break;
7488 default:
7489 assert(0);
7490 break;
7491 }
7492 }
7493
7494 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
7495 {
7496 switch (reason) {
7497 case FC_PUSH_VPM:
7498 ++ctx->bc->stack.push;
7499 break;
7500 case FC_PUSH_WQM:
7501 ++ctx->bc->stack.push_wqm;
7502 case FC_LOOP:
7503 ++ctx->bc->stack.loop;
7504 break;
7505 default:
7506 assert(0);
7507 }
7508
7509 callstack_update_max_depth(ctx, reason);
7510 }
7511
7512 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
7513 {
7514 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
7515
7516 sp->mid = realloc((void *)sp->mid,
7517 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
7518 sp->mid[sp->num_mid] = ctx->bc->cf_last;
7519 sp->num_mid++;
7520 }
7521
7522 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
7523 {
7524 ctx->bc->fc_sp++;
7525 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
7526 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
7527 }
7528
7529 static void fc_poplevel(struct r600_shader_ctx *ctx)
7530 {
7531 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
7532 free(sp->mid);
7533 sp->mid = NULL;
7534 sp->num_mid = 0;
7535 sp->start = NULL;
7536 sp->type = 0;
7537 ctx->bc->fc_sp--;
7538 }
7539
#if 0
/* Disabled scaffolding for subroutine return support. Kept compilable:
 * the original had a stray ')' after two r600_bytecode_add_cfinst()
 * calls that would break the build if this block were ever enabled. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN);
	return 0;
}

static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
7587
/* Begin an IF block: emit the predicate-setting ALU clause plus the
 * conditional JUMP, push a flow-control frame (whose JUMP target is
 * patched later by tgsi_else/tgsi_endif) and account for the stack push. */
static int emit_if(struct r600_shader_ctx *ctx, int opcode)
{
	int alu_type = CF_OP_ALU_PUSH_BEFORE;

	/* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
	 * LOOP_STARTxxx for nested loops may put the branch stack into a state
	 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
	 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
	if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
		r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
		ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
		alu_type = CF_OP_ALU;
	}

	emit_logic_pred(ctx, opcode, alu_type);

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);

	fc_pushlevel(ctx, FC_IF);

	callstack_push(ctx, FC_PUSH_VPM);
	return 0;
}
7611
/* TGSI IF: floating-point condition, taken when src != 0.0f. */
static int tgsi_if(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE);
}
7616
/* TGSI UIF: integer condition, taken when src != 0. */
static int tgsi_uif(struct r600_shader_ctx *ctx)
{
	return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
}
7621
/* TGSI ELSE: emit an ELSE CF instruction, record it as the mid-point of
 * the current FC_IF entry, and patch the IF's JUMP to land here. */
static int tgsi_else(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
	/* the ELSE pops the stack level pushed by the IF's predicate */
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, ctx->bc->fc_sp);
	/* back-patch the IF's JUMP target to this ELSE */
	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
	return 0;
}
7631
/* TGSI ENDIF: pop the branch stack and back-patch the pending jump —
 * either the IF's JUMP (no ELSE seen) or the ELSE — to the CF
 * instruction following this point (+2). */
static int tgsi_endif(struct r600_shader_ctx *ctx)
{
	pops(ctx, 1);
	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
		R600_ERR("if/endif unbalanced in shader\n");
		return -1;
	}

	if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
		/* no ELSE: the IF's JUMP skips straight past the ENDIF */
		ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
		ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
	} else {
		/* with an ELSE: the ELSE jumps past the ENDIF */
		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
	}
	fc_poplevel(ctx);

	callstack_pop(ctx, FC_PUSH_VPM);
	return 0;
}
7651
/* TGSI BGNLOOP: open a hardware loop and push an FC_LOOP entry; the
 * LOOP_START/LOOP_END address fixups happen in tgsi_endloop(). */
static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
{
	/* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
	 * limited to 4096 iterations, like the other LOOP_* instructions. */
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);

	fc_pushlevel(ctx, FC_LOOP);

	/* check stack depth */
	callstack_push(ctx, FC_LOOP);
	return 0;
}
7664
/* TGSI ENDLOOP: emit LOOP_END and resolve all the loop's jump targets
 * (start, end, and every BREAK/CONTINUE recorded as a mid-point). */
static int tgsi_endloop(struct r600_shader_ctx *ctx)
{
	int i;

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);

	if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
		R600_ERR("loop/endloop in shader code are not paired.\n");
		return -EINVAL;
	}

	/* fixup loop pointers - from r600isa
	   LOOP END points to CF after LOOP START,
	   LOOP START point to CF after LOOP END
	   BRK/CONT point to LOOP END CF
	*/
	ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;

	ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;

	/* point every recorded BREAK/CONTINUE at this LOOP_END */
	for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
		ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
	}
	/* XXX add LOOPRET support */
	fc_poplevel(ctx);
	callstack_pop(ctx, FC_LOOP);
	return 0;
}
7693
/* TGSI BREAKC: conditional loop break.  Finds the innermost enclosing
 * FC_LOOP on the flow-control stack, then emits either an IF-wrapped
 * LOOP_BREAK (buggy-hardware path) or a predicated ALU_BREAK. */
static int tgsi_loop_breakc(struct r600_shader_ctx *ctx)
{
	int r;
	unsigned int fscp;

	/* search down the stack for the innermost loop (slot 0 is unused) */
	for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
	{
		if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
			break;
	}
	if (fscp == 0) {
		R600_ERR("BREAKC not inside loop/endloop pair\n");
		return -EINVAL;
	}

	if (ctx->bc->chip_class == EVERGREEN &&
	    ctx->bc->family != CHIP_CYPRESS &&
	    ctx->bc->family != CHIP_JUNIPER) {
		/* HW bug: ALU_BREAK does not save the active mask correctly */
		r = tgsi_uif(ctx);
		if (r)
			return r;

		r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_BREAK);
		if (r)
			return r;
		/* register the BREAK so ENDLOOP can patch its target */
		fc_set_mid(ctx, fscp);

		/* close the IF opened by tgsi_uif() above */
		return tgsi_endif(ctx);
	} else {
		r = emit_logic_pred(ctx, ALU_OP2_PRED_SETE_INT, CF_OP_ALU_BREAK);
		if (r)
			return r;
		fc_set_mid(ctx, fscp);
	}

	return 0;
}
7732
7733 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
7734 {
7735 unsigned int fscp;
7736
7737 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
7738 {
7739 if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
7740 break;
7741 }
7742
7743 if (fscp == 0) {
7744 R600_ERR("Break not inside loop/endloop pair\n");
7745 return -EINVAL;
7746 }
7747
7748 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
7749
7750 fc_set_mid(ctx, fscp);
7751
7752 return 0;
7753 }
7754
7755 static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
7756 {
7757 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7758 int stream = ctx->literals[inst->Src[0].Register.Index * 4 + inst->Src[0].Register.SwizzleX];
7759 int r;
7760
7761 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
7762 emit_gs_ring_writes(ctx, ctx->gs_stream_output_info, stream, TRUE);
7763
7764 r = r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
7765 if (!r)
7766 ctx->bc->cf_last->count = stream; // Count field for CUT/EMIT_VERTEX indicates which stream
7767 return r;
7768 }
7769
/* TGSI UMAD: dst = src0 * src1 + src2 (unsigned 32-bit, low half of the
 * product).  MULLO has no fused-add form, so the product goes through
 * temp_reg and a second ADD_INT pass folds in src2. */
static int tgsi_umad(struct r600_shader_ctx *ctx)
{
	struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
	struct r600_bytecode_alu alu;
	int i, j, k, r;
	int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);

	/* src0 * src1 */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		if (ctx->bc->chip_class == CAYMAN) {
			/* On Cayman the t-slot-only MULLO executes in all four
			 * vector slots (see CAYMAN notes at the top of the file);
			 * issue it in every slot but write only the lane matching
			 * the destination channel. */
			for (j = 0 ; j < 4; j++) {
				memset(&alu, 0, sizeof(struct r600_bytecode_alu));

				alu.op = ALU_OP2_MULLO_UINT;
				for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
					r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
				}
				alu.dst.chan = j;
				alu.dst.sel = ctx->temp_reg;
				/* keep only the slot matching the dest channel */
				alu.dst.write = (j == i);
				if (j == 3)
					alu.last = 1;
				r = r600_bytecode_add_alu(ctx->bc, &alu);
				if (r)
					return r;
			}
		} else {
			memset(&alu, 0, sizeof(struct r600_bytecode_alu));

			alu.dst.chan = i;
			alu.dst.sel = ctx->temp_reg;
			alu.dst.write = 1;

			alu.op = ALU_OP2_MULLO_UINT;
			for (j = 0; j < 2; j++) {
				r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
			}

			alu.last = 1;
			r = r600_bytecode_add_alu(ctx->bc, &alu);
			if (r)
				return r;
		}
	}


	/* + src2: add the third operand to the product in temp_reg */
	for (i = 0; i < lasti + 1; i++) {
		if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
			continue;

		memset(&alu, 0, sizeof(struct r600_bytecode_alu));
		tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);

		alu.op = ALU_OP2_ADD_INT;

		alu.src[0].sel = ctx->temp_reg;
		alu.src[0].chan = i;

		r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
		if (i == lasti) {
			alu.last = 1;
		}
		r = r600_bytecode_add_alu(ctx->bc, &alu);
		if (r)
			return r;
	}
	return 0;
}
7841
/* TGSI opcode dispatch table for R600/R700 (pre-Evergreen) GPUs.
 * Indexed by TGSI_OPCODE_*; each entry pairs the native ALU/CF/fetch
 * opcode with the translation callback.  Bare numeric indices are gaps
 * in the TGSI opcode space. */
static const struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},

	/* XXX:
	 * For state trackers other than OpenGL, we'll want to use
	 * _RECIP_IEEE instead.
	 */
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate},

	[TGSI_OPCODE_RSQ]	= { ALU_OP0_NOP, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_r600_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	/* fine derivatives need per-pixel gradients, unsupported pre-EG */
	[TGSI_OPCODE_DDX_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DDY_FINE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2_trans},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_loop_breakc},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2_trans},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2_swap},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_r600_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_unsupported},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_unsupported},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_unsupported},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_unsupported},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_unsupported},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_unsupported},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_unsupported},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_unsupported},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
8046
/* TGSI opcode dispatch table for Evergreen GPUs.  Differs from the R600
 * table mainly in: IEEE RCP/RSQ, native shifts (no trans slot needed),
 * TG4/LODQ/bitfield ops supported, EG-specific interpolation, and the
 * double-precision opcodes at the end. */
static const struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
	[TGSI_OPCODE_ARL]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[TGSI_OPCODE_MOV]	= { ALU_OP1_MOV, tgsi_op2},
	[TGSI_OPCODE_LIT]	= { ALU_OP0_NOP, tgsi_lit},
	[TGSI_OPCODE_RCP]	= { ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_RSQ]	= { ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
	[TGSI_OPCODE_EXP]	= { ALU_OP0_NOP, tgsi_exp},
	[TGSI_OPCODE_LOG]	= { ALU_OP0_NOP, tgsi_log},
	[TGSI_OPCODE_MUL]	= { ALU_OP2_MUL, tgsi_op2},
	[TGSI_OPCODE_ADD]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_DP3]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DP4]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_DST]	= { ALU_OP0_NOP, tgsi_opdst},
	[TGSI_OPCODE_MIN]	= { ALU_OP2_MIN, tgsi_op2},
	[TGSI_OPCODE_MAX]	= { ALU_OP2_MAX, tgsi_op2},
	[TGSI_OPCODE_SLT]	= { ALU_OP2_SETGT, tgsi_op2_swap},
	[TGSI_OPCODE_SGE]	= { ALU_OP2_SETGE, tgsi_op2},
	[TGSI_OPCODE_MAD]	= { ALU_OP3_MULADD, tgsi_op3},
	[TGSI_OPCODE_SUB]	= { ALU_OP2_ADD, tgsi_op2},
	[TGSI_OPCODE_LRP]	= { ALU_OP0_NOP, tgsi_lrp},
	[TGSI_OPCODE_FMA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SQRT]	= { ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_DP2A]	= { ALU_OP0_NOP, tgsi_unsupported},
	[22]			= { ALU_OP0_NOP, tgsi_unsupported},
	[23]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FRC]	= { ALU_OP1_FRACT, tgsi_op2},
	[TGSI_OPCODE_CLAMP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FLR]	= { ALU_OP1_FLOOR, tgsi_op2},
	[TGSI_OPCODE_ROUND]	= { ALU_OP1_RNDNE, tgsi_op2},
	[TGSI_OPCODE_EX2]	= { ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_LG2]	= { ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
	[TGSI_OPCODE_POW]	= { ALU_OP0_NOP, tgsi_pow},
	[TGSI_OPCODE_XPD]	= { ALU_OP0_NOP, tgsi_xpd},
	[32]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ABS]	= { ALU_OP1_MOV, tgsi_op2},
	[34]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DPH]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_COS]	= { ALU_OP1_COS, tgsi_trig},
	[TGSI_OPCODE_DDX]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_KILL]	= { ALU_OP2_KILLGT, tgsi_kill},  /* unconditional kill */
	[TGSI_OPCODE_PK2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_PK4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[44]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SEQ]	= { ALU_OP2_SETE, tgsi_op2},
	[46]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SGT]	= { ALU_OP2_SETGT, tgsi_op2},
	[TGSI_OPCODE_SIN]	= { ALU_OP1_SIN, tgsi_trig},
	[TGSI_OPCODE_SLE]	= { ALU_OP2_SETGE, tgsi_op2_swap},
	[TGSI_OPCODE_SNE]	= { ALU_OP2_SETNE, tgsi_op2},
	[51]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXD]	= { FETCH_OP_SAMPLE_G, tgsi_tex},
	[TGSI_OPCODE_TXP]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_UP2H]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP2US]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4B]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_UP4UB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[59]			= { ALU_OP0_NOP, tgsi_unsupported},
	[60]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ARR]	= { ALU_OP0_NOP, tgsi_eg_arl},
	[62]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CAL]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_RET]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SSG]	= { ALU_OP0_NOP, tgsi_ssg},
	[TGSI_OPCODE_CMP]	= { ALU_OP0_NOP, tgsi_cmp},
	[TGSI_OPCODE_SCS]	= { ALU_OP0_NOP, tgsi_scs},
	[TGSI_OPCODE_TXB]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[69]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DIV]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DP2]	= { ALU_OP2_DOT4, tgsi_dp},
	[TGSI_OPCODE_TXL]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_BRK]	= { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
	[TGSI_OPCODE_IF]	= { ALU_OP0_NOP, tgsi_if},
	[TGSI_OPCODE_UIF]	= { ALU_OP0_NOP, tgsi_uif},
	[76]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ELSE]	= { ALU_OP0_NOP, tgsi_else},
	[TGSI_OPCODE_ENDIF]	= { ALU_OP0_NOP, tgsi_endif},
	[TGSI_OPCODE_DDX_FINE]	= { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
	[TGSI_OPCODE_DDY_FINE]	= { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
	[TGSI_OPCODE_PUSHA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_POPA]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CEIL]	= { ALU_OP1_CEIL, tgsi_op2},
	[TGSI_OPCODE_I2F]	= { ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_NOT]	= { ALU_OP1_NOT_INT, tgsi_op2},
	[TGSI_OPCODE_TRUNC]	= { ALU_OP1_TRUNC, tgsi_op2},
	[TGSI_OPCODE_SHL]	= { ALU_OP2_LSHL_INT, tgsi_op2},
	[88]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_AND]	= { ALU_OP2_AND_INT, tgsi_op2},
	[TGSI_OPCODE_OR]	= { ALU_OP2_OR_INT, tgsi_op2},
	[TGSI_OPCODE_MOD]	= { ALU_OP0_NOP, tgsi_imod},
	[TGSI_OPCODE_XOR]	= { ALU_OP2_XOR_INT, tgsi_op2},
	[TGSI_OPCODE_SAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXF]	= { FETCH_OP_LD, tgsi_tex},
	[TGSI_OPCODE_TXQ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_CONT]	= { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
	[TGSI_OPCODE_EMIT]	= { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_ENDPRIM]	= { CF_OP_CUT_VERTEX, tgsi_gs_emit},
	[TGSI_OPCODE_BGNLOOP]	= { ALU_OP0_NOP, tgsi_bgnloop},
	[TGSI_OPCODE_BGNSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDLOOP]	= { ALU_OP0_NOP, tgsi_endloop},
	[TGSI_OPCODE_ENDSUB]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TXQ_LZ]	= { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
	[TGSI_OPCODE_TXQS]	= { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
	[105]			= { ALU_OP0_NOP, tgsi_unsupported},
	[106]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_NOP]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_FSEQ]	= { ALU_OP2_SETE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSGE]	= { ALU_OP2_SETGE_DX10, tgsi_op2},
	[TGSI_OPCODE_FSLT]	= { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
	[TGSI_OPCODE_FSNE]	= { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
	[112]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CALLNZ]	= { ALU_OP0_NOP, tgsi_unsupported},
	[114]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BREAKC]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_KILL_IF]	= { ALU_OP2_KILLGT, tgsi_kill},  /* conditional kill */
	[TGSI_OPCODE_END]	= { ALU_OP0_NOP, tgsi_end},  /* aka HALT */
	[118]			= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_F2I]	= { ALU_OP1_FLT_TO_INT, tgsi_f2i},
	[TGSI_OPCODE_IDIV]	= { ALU_OP0_NOP, tgsi_idiv},
	[TGSI_OPCODE_IMAX]	= { ALU_OP2_MAX_INT, tgsi_op2},
	[TGSI_OPCODE_IMIN]	= { ALU_OP2_MIN_INT, tgsi_op2},
	[TGSI_OPCODE_INEG]	= { ALU_OP2_SUB_INT, tgsi_ineg},
	[TGSI_OPCODE_ISGE]	= { ALU_OP2_SETGE_INT, tgsi_op2},
	[TGSI_OPCODE_ISHR]	= { ALU_OP2_ASHR_INT, tgsi_op2},
	[TGSI_OPCODE_ISLT]	= { ALU_OP2_SETGT_INT, tgsi_op2_swap},
	[TGSI_OPCODE_F2U]	= { ALU_OP1_FLT_TO_UINT, tgsi_f2i},
	[TGSI_OPCODE_U2F]	= { ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
	[TGSI_OPCODE_UADD]	= { ALU_OP2_ADD_INT, tgsi_op2},
	[TGSI_OPCODE_UDIV]	= { ALU_OP0_NOP, tgsi_udiv},
	[TGSI_OPCODE_UMAD]	= { ALU_OP0_NOP, tgsi_umad},
	[TGSI_OPCODE_UMAX]	= { ALU_OP2_MAX_UINT, tgsi_op2},
	[TGSI_OPCODE_UMIN]	= { ALU_OP2_MIN_UINT, tgsi_op2},
	[TGSI_OPCODE_UMOD]	= { ALU_OP0_NOP, tgsi_umod},
	[TGSI_OPCODE_UMUL]	= { ALU_OP2_MULLO_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_USEQ]	= { ALU_OP2_SETE_INT, tgsi_op2},
	[TGSI_OPCODE_USGE]	= { ALU_OP2_SETGE_UINT, tgsi_op2},
	[TGSI_OPCODE_USHR]	= { ALU_OP2_LSHR_INT, tgsi_op2},
	[TGSI_OPCODE_USLT]	= { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
	[TGSI_OPCODE_USNE]	= { ALU_OP2_SETNE_INT, tgsi_op2},
	[TGSI_OPCODE_SWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_CASE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_DEFAULT]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ENDSWITCH]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_I_MS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_B]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_C_LZ]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_D]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_L]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_GATHER4]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SVIEWINFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_POS]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_SAMPLE_INFO]	= { 0, tgsi_unsupported},
	[TGSI_OPCODE_UARL]	= { ALU_OP1_MOVA_INT, tgsi_eg_arl},
	[TGSI_OPCODE_UCMP]	= { ALU_OP0_NOP, tgsi_ucmp},
	[TGSI_OPCODE_IABS]	= { 0, tgsi_iabs},
	[TGSI_OPCODE_ISSG]	= { 0, tgsi_issg},
	[TGSI_OPCODE_LOAD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_STORE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_MFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_LFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_SFENCE]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_BARRIER]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUADD]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXCHG]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMCAS]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMAND]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMXOR]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMUMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMIN]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_ATOMIMAX]	= { ALU_OP0_NOP, tgsi_unsupported},
	[TGSI_OPCODE_TEX2]	= { FETCH_OP_SAMPLE, tgsi_tex},
	[TGSI_OPCODE_TXB2]	= { FETCH_OP_SAMPLE_LB, tgsi_tex},
	[TGSI_OPCODE_TXL2]	= { FETCH_OP_SAMPLE_L, tgsi_tex},
	[TGSI_OPCODE_IMUL_HI]	= { ALU_OP2_MULHI_INT, tgsi_op2_trans},
	[TGSI_OPCODE_UMUL_HI]	= { ALU_OP2_MULHI_UINT, tgsi_op2_trans},
	[TGSI_OPCODE_TG4]	= { FETCH_OP_GATHER4, tgsi_tex},
	[TGSI_OPCODE_LODQ]	= { FETCH_OP_GET_LOD, tgsi_tex},
	[TGSI_OPCODE_IBFE]	= { ALU_OP3_BFE_INT, tgsi_op3},
	[TGSI_OPCODE_UBFE]	= { ALU_OP3_BFE_UINT, tgsi_op3},
	[TGSI_OPCODE_BFI]	= { ALU_OP0_NOP, tgsi_bfi},
	[TGSI_OPCODE_BREV]	= { ALU_OP1_BFREV_INT, tgsi_op2},
	[TGSI_OPCODE_POPC]	= { ALU_OP1_BCNT_INT, tgsi_op2},
	[TGSI_OPCODE_LSB]	= { ALU_OP1_FFBL_INT, tgsi_op2},
	[TGSI_OPCODE_IMSB]	= { ALU_OP1_FFBH_INT, tgsi_msb},
	[TGSI_OPCODE_UMSB]	= { ALU_OP1_FFBH_UINT, tgsi_msb},
	[TGSI_OPCODE_INTERP_CENTROID]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_SAMPLE]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	[TGSI_OPCODE_INTERP_OFFSET]	= { ALU_OP0_NOP, tgsi_interp_egcm},
	/* double-precision opcodes */
	[TGSI_OPCODE_F2D]	= { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
	[TGSI_OPCODE_D2F]	= { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DABS]	= { ALU_OP1_MOV, tgsi_op2_64},
	[TGSI_OPCODE_DNEG]	= { ALU_OP2_ADD_64, tgsi_dneg},
	[TGSI_OPCODE_DADD]	= { ALU_OP2_ADD_64, tgsi_op2_64},
	[TGSI_OPCODE_DMUL]	= { ALU_OP2_MUL_64, cayman_mul_double_instr},
	[TGSI_OPCODE_DMAX]	= { ALU_OP2_MAX_64, tgsi_op2_64},
	[TGSI_OPCODE_DMIN]	= { ALU_OP2_MIN_64, tgsi_op2_64},
	[TGSI_OPCODE_DSLT]	= { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
	[TGSI_OPCODE_DSGE]	= { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSEQ]	= { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DSNE]	= { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
	[TGSI_OPCODE_DRCP]	= { ALU_OP2_RECIP_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DSQRT]	= { ALU_OP2_SQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_DMAD]	= { ALU_OP3_FMA_64, tgsi_op3_64},
	[TGSI_OPCODE_DFRAC]	= { ALU_OP1_FRACT_64, tgsi_op2_64},
	[TGSI_OPCODE_DLDEXP]	= { ALU_OP2_LDEXP_64, tgsi_op2_64},
	[TGSI_OPCODE_DFRACEXP]	= { ALU_OP1_FREXP_64, tgsi_dfracexp},
	[TGSI_OPCODE_D2I]	= { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
	[TGSI_OPCODE_I2D]	= { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_D2U]	= { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
	[TGSI_OPCODE_U2D]	= { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
	[TGSI_OPCODE_DRSQ]	= { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
	[TGSI_OPCODE_LAST]	= { ALU_OP0_NOP, tgsi_unsupported},
};
8268
/*
 * Cayman (CM) dispatch table: maps each TGSI opcode to the hardware ALU/CF/
 * fetch opcode used for it plus the emit callback that lowers the TGSI
 * instruction into r600 bytecode.  Indexed directly by TGSI_OPCODE_* value;
 * designated initializers leave gap slots (bare numeric indices) for opcode
 * numbers with no TGSI name, all routed to tgsi_unsupported.
 *
 * Cayman-specific handlers (cayman_*) exist because several ops that were
 * t-slot-only on earlier chips are implemented across all vector slots on
 * Cayman (see the CAYMAN notes at the top of this file), so they emit the
 * instruction replicated per-slot instead of a single t-slot op.
 *
 * Entries where the first field is ALU_OP0_NOP (or 0) either have a handler
 * that picks its own opcodes internally (e.g. tgsi_lit, tgsi_idiv) or are
 * unsupported on this chip class.
 */
8269 static const struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
8270 [TGSI_OPCODE_ARL] = { ALU_OP0_NOP, tgsi_eg_arl},
8271 [TGSI_OPCODE_MOV] = { ALU_OP1_MOV, tgsi_op2},
8272 [TGSI_OPCODE_LIT] = { ALU_OP0_NOP, tgsi_lit},
/* Transcendentals use per-slot replication on Cayman (no t-slot). */
8273 [TGSI_OPCODE_RCP] = { ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
8274 [TGSI_OPCODE_RSQ] = { ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
8275 [TGSI_OPCODE_EXP] = { ALU_OP0_NOP, tgsi_exp},
8276 [TGSI_OPCODE_LOG] = { ALU_OP0_NOP, tgsi_log},
8277 [TGSI_OPCODE_MUL] = { ALU_OP2_MUL, tgsi_op2},
8278 [TGSI_OPCODE_ADD] = { ALU_OP2_ADD, tgsi_op2},
8279 [TGSI_OPCODE_DP3] = { ALU_OP2_DOT4, tgsi_dp},
8280 [TGSI_OPCODE_DP4] = { ALU_OP2_DOT4, tgsi_dp},
8281 [TGSI_OPCODE_DST] = { ALU_OP0_NOP, tgsi_opdst},
8282 [TGSI_OPCODE_MIN] = { ALU_OP2_MIN, tgsi_op2},
8283 [TGSI_OPCODE_MAX] = { ALU_OP2_MAX, tgsi_op2},
/* SLT/SLE have no direct hw op; implemented as swapped SETGT/SETGE. */
8284 [TGSI_OPCODE_SLT] = { ALU_OP2_SETGT, tgsi_op2_swap},
8285 [TGSI_OPCODE_SGE] = { ALU_OP2_SETGE, tgsi_op2},
8286 [TGSI_OPCODE_MAD] = { ALU_OP3_MULADD, tgsi_op3},
/* SUB lowers to ADD; source negation is handled inside the emit path. */
8287 [TGSI_OPCODE_SUB] = { ALU_OP2_ADD, tgsi_op2},
8288 [TGSI_OPCODE_LRP] = { ALU_OP0_NOP, tgsi_lrp},
8289 [TGSI_OPCODE_FMA] = { ALU_OP0_NOP, tgsi_unsupported},
8290 [TGSI_OPCODE_SQRT] = { ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
8291 [TGSI_OPCODE_DP2A] = { ALU_OP0_NOP, tgsi_unsupported},
/* Bare numeric indices are TGSI opcode numbers with no current name. */
8292 [22] = { ALU_OP0_NOP, tgsi_unsupported},
8293 [23] = { ALU_OP0_NOP, tgsi_unsupported},
8294 [TGSI_OPCODE_FRC] = { ALU_OP1_FRACT, tgsi_op2},
8295 [TGSI_OPCODE_CLAMP] = { ALU_OP0_NOP, tgsi_unsupported},
8296 [TGSI_OPCODE_FLR] = { ALU_OP1_FLOOR, tgsi_op2},
8297 [TGSI_OPCODE_ROUND] = { ALU_OP1_RNDNE, tgsi_op2},
8298 [TGSI_OPCODE_EX2] = { ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
8299 [TGSI_OPCODE_LG2] = { ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
8300 [TGSI_OPCODE_POW] = { ALU_OP0_NOP, cayman_pow},
8301 [TGSI_OPCODE_XPD] = { ALU_OP0_NOP, tgsi_xpd},
8302 [32] = { ALU_OP0_NOP, tgsi_unsupported},
/* ABS is a MOV with the source-absolute modifier applied by the handler. */
8303 [TGSI_OPCODE_ABS] = { ALU_OP1_MOV, tgsi_op2},
8304 [34] = { ALU_OP0_NOP, tgsi_unsupported},
8305 [TGSI_OPCODE_DPH] = { ALU_OP2_DOT4, tgsi_dp},
8306 [TGSI_OPCODE_COS] = { ALU_OP1_COS, cayman_trig},
/* Derivatives are implemented via the texture/fetch unit. */
8307 [TGSI_OPCODE_DDX] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
8308 [TGSI_OPCODE_DDY] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
8309 [TGSI_OPCODE_KILL] = { ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
8310 [TGSI_OPCODE_PK2H] = { ALU_OP0_NOP, tgsi_unsupported},
8311 [TGSI_OPCODE_PK2US] = { ALU_OP0_NOP, tgsi_unsupported},
8312 [TGSI_OPCODE_PK4B] = { ALU_OP0_NOP, tgsi_unsupported},
8313 [TGSI_OPCODE_PK4UB] = { ALU_OP0_NOP, tgsi_unsupported},
8314 [44] = { ALU_OP0_NOP, tgsi_unsupported},
8315 [TGSI_OPCODE_SEQ] = { ALU_OP2_SETE, tgsi_op2},
8316 [46] = { ALU_OP0_NOP, tgsi_unsupported},
8317 [TGSI_OPCODE_SGT] = { ALU_OP2_SETGT, tgsi_op2},
8318 [TGSI_OPCODE_SIN] = { ALU_OP1_SIN, cayman_trig},
8319 [TGSI_OPCODE_SLE] = { ALU_OP2_SETGE, tgsi_op2_swap},
8320 [TGSI_OPCODE_SNE] = { ALU_OP2_SETNE, tgsi_op2},
8321 [51] = { ALU_OP0_NOP, tgsi_unsupported},
/* Texture sampling ops all route through tgsi_tex with a fetch opcode. */
8322 [TGSI_OPCODE_TEX] = { FETCH_OP_SAMPLE, tgsi_tex},
8323 [TGSI_OPCODE_TXD] = { FETCH_OP_SAMPLE_G, tgsi_tex},
8324 [TGSI_OPCODE_TXP] = { FETCH_OP_SAMPLE, tgsi_tex},
8325 [TGSI_OPCODE_UP2H] = { ALU_OP0_NOP, tgsi_unsupported},
8326 [TGSI_OPCODE_UP2US] = { ALU_OP0_NOP, tgsi_unsupported},
8327 [TGSI_OPCODE_UP4B] = { ALU_OP0_NOP, tgsi_unsupported},
8328 [TGSI_OPCODE_UP4UB] = { ALU_OP0_NOP, tgsi_unsupported},
8329 [59] = { ALU_OP0_NOP, tgsi_unsupported},
8330 [60] = { ALU_OP0_NOP, tgsi_unsupported},
8331 [TGSI_OPCODE_ARR] = { ALU_OP0_NOP, tgsi_eg_arl},
8332 [62] = { ALU_OP0_NOP, tgsi_unsupported},
8333 [TGSI_OPCODE_CAL] = { ALU_OP0_NOP, tgsi_unsupported},
8334 [TGSI_OPCODE_RET] = { ALU_OP0_NOP, tgsi_unsupported},
8335 [TGSI_OPCODE_SSG] = { ALU_OP0_NOP, tgsi_ssg},
8336 [TGSI_OPCODE_CMP] = { ALU_OP0_NOP, tgsi_cmp},
8337 [TGSI_OPCODE_SCS] = { ALU_OP0_NOP, tgsi_scs},
8338 [TGSI_OPCODE_TXB] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
8339 [69] = { ALU_OP0_NOP, tgsi_unsupported},
8340 [TGSI_OPCODE_DIV] = { ALU_OP0_NOP, tgsi_unsupported},
8341 [TGSI_OPCODE_DP2] = { ALU_OP2_DOT4, tgsi_dp},
8342 [TGSI_OPCODE_TXL] = { FETCH_OP_SAMPLE_L, tgsi_tex},
/* Control flow maps to CF_OP_* entries, emitted by the flow handlers. */
8343 [TGSI_OPCODE_BRK] = { CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
8344 [TGSI_OPCODE_IF] = { ALU_OP0_NOP, tgsi_if},
8345 [TGSI_OPCODE_UIF] = { ALU_OP0_NOP, tgsi_uif},
8346 [76] = { ALU_OP0_NOP, tgsi_unsupported},
8347 [TGSI_OPCODE_ELSE] = { ALU_OP0_NOP, tgsi_else},
8348 [TGSI_OPCODE_ENDIF] = { ALU_OP0_NOP, tgsi_endif},
8349 [TGSI_OPCODE_DDX_FINE] = { FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
8350 [TGSI_OPCODE_DDY_FINE] = { FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
8351 [TGSI_OPCODE_PUSHA] = { ALU_OP0_NOP, tgsi_unsupported},
8352 [TGSI_OPCODE_POPA] = { ALU_OP0_NOP, tgsi_unsupported},
8353 [TGSI_OPCODE_CEIL] = { ALU_OP1_CEIL, tgsi_op2},
8354 [TGSI_OPCODE_I2F] = { ALU_OP1_INT_TO_FLT, tgsi_op2},
8355 [TGSI_OPCODE_NOT] = { ALU_OP1_NOT_INT, tgsi_op2},
8356 [TGSI_OPCODE_TRUNC] = { ALU_OP1_TRUNC, tgsi_op2},
8357 [TGSI_OPCODE_SHL] = { ALU_OP2_LSHL_INT, tgsi_op2},
8358 [88] = { ALU_OP0_NOP, tgsi_unsupported},
8359 [TGSI_OPCODE_AND] = { ALU_OP2_AND_INT, tgsi_op2},
8360 [TGSI_OPCODE_OR] = { ALU_OP2_OR_INT, tgsi_op2},
8361 [TGSI_OPCODE_MOD] = { ALU_OP0_NOP, tgsi_imod},
8362 [TGSI_OPCODE_XOR] = { ALU_OP2_XOR_INT, tgsi_op2},
8363 [TGSI_OPCODE_SAD] = { ALU_OP0_NOP, tgsi_unsupported},
8364 [TGSI_OPCODE_TXF] = { FETCH_OP_LD, tgsi_tex},
8365 [TGSI_OPCODE_TXQ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
8366 [TGSI_OPCODE_CONT] = { CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
/* Geometry-shader vertex stream ops. */
8367 [TGSI_OPCODE_EMIT] = { CF_OP_EMIT_VERTEX, tgsi_gs_emit},
8368 [TGSI_OPCODE_ENDPRIM] = { CF_OP_CUT_VERTEX, tgsi_gs_emit},
8369 [TGSI_OPCODE_BGNLOOP] = { ALU_OP0_NOP, tgsi_bgnloop},
8370 [TGSI_OPCODE_BGNSUB] = { ALU_OP0_NOP, tgsi_unsupported},
8371 [TGSI_OPCODE_ENDLOOP] = { ALU_OP0_NOP, tgsi_endloop},
8372 [TGSI_OPCODE_ENDSUB] = { ALU_OP0_NOP, tgsi_unsupported},
8373 [TGSI_OPCODE_TXQ_LZ] = { FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
8374 [TGSI_OPCODE_TXQS] = { FETCH_OP_GET_NUMBER_OF_SAMPLES, tgsi_tex},
8375 [105] = { ALU_OP0_NOP, tgsi_unsupported},
8376 [106] = { ALU_OP0_NOP, tgsi_unsupported},
8377 [TGSI_OPCODE_NOP] = { ALU_OP0_NOP, tgsi_unsupported},
/* DX10-style float comparisons return integer 0/~0 instead of 0.0/1.0. */
8378 [TGSI_OPCODE_FSEQ] = { ALU_OP2_SETE_DX10, tgsi_op2},
8379 [TGSI_OPCODE_FSGE] = { ALU_OP2_SETGE_DX10, tgsi_op2},
8380 [TGSI_OPCODE_FSLT] = { ALU_OP2_SETGT_DX10, tgsi_op2_swap},
8381 [TGSI_OPCODE_FSNE] = { ALU_OP2_SETNE_DX10, tgsi_op2_swap},
8382 [112] = { ALU_OP0_NOP, tgsi_unsupported},
8383 [TGSI_OPCODE_CALLNZ] = { ALU_OP0_NOP, tgsi_unsupported},
8384 [114] = { ALU_OP0_NOP, tgsi_unsupported},
8385 [TGSI_OPCODE_BREAKC] = { ALU_OP0_NOP, tgsi_unsupported},
8386 [TGSI_OPCODE_KILL_IF] = { ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
8387 [TGSI_OPCODE_END] = { ALU_OP0_NOP, tgsi_end}, /* aka HALT */
8388 [118] = { ALU_OP0_NOP, tgsi_unsupported},
8389 [TGSI_OPCODE_F2I] = { ALU_OP1_FLT_TO_INT, tgsi_op2},
8390 [TGSI_OPCODE_IDIV] = { ALU_OP0_NOP, tgsi_idiv},
8391 [TGSI_OPCODE_IMAX] = { ALU_OP2_MAX_INT, tgsi_op2},
8392 [TGSI_OPCODE_IMIN] = { ALU_OP2_MIN_INT, tgsi_op2},
8393 [TGSI_OPCODE_INEG] = { ALU_OP2_SUB_INT, tgsi_ineg},
8394 [TGSI_OPCODE_ISGE] = { ALU_OP2_SETGE_INT, tgsi_op2},
8395 [TGSI_OPCODE_ISHR] = { ALU_OP2_ASHR_INT, tgsi_op2},
8396 [TGSI_OPCODE_ISLT] = { ALU_OP2_SETGT_INT, tgsi_op2_swap},
8397 [TGSI_OPCODE_F2U] = { ALU_OP1_FLT_TO_UINT, tgsi_op2},
8398 [TGSI_OPCODE_U2F] = { ALU_OP1_UINT_TO_FLT, tgsi_op2},
8399 [TGSI_OPCODE_UADD] = { ALU_OP2_ADD_INT, tgsi_op2},
8400 [TGSI_OPCODE_UDIV] = { ALU_OP0_NOP, tgsi_udiv},
8401 [TGSI_OPCODE_UMAD] = { ALU_OP0_NOP, tgsi_umad},
8402 [TGSI_OPCODE_UMAX] = { ALU_OP2_MAX_UINT, tgsi_op2},
8403 [TGSI_OPCODE_UMIN] = { ALU_OP2_MIN_UINT, tgsi_op2},
8404 [TGSI_OPCODE_UMOD] = { ALU_OP0_NOP, tgsi_umod},
/* Integer multiplies use the Cayman per-slot multiply emitter. */
8405 [TGSI_OPCODE_UMUL] = { ALU_OP2_MULLO_INT, cayman_mul_int_instr},
8406 [TGSI_OPCODE_USEQ] = { ALU_OP2_SETE_INT, tgsi_op2},
8407 [TGSI_OPCODE_USGE] = { ALU_OP2_SETGE_UINT, tgsi_op2},
8408 [TGSI_OPCODE_USHR] = { ALU_OP2_LSHR_INT, tgsi_op2},
8409 [TGSI_OPCODE_USLT] = { ALU_OP2_SETGT_UINT, tgsi_op2_swap},
8410 [TGSI_OPCODE_USNE] = { ALU_OP2_SETNE_INT, tgsi_op2},
8411 [TGSI_OPCODE_SWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
8412 [TGSI_OPCODE_CASE] = { ALU_OP0_NOP, tgsi_unsupported},
8413 [TGSI_OPCODE_DEFAULT] = { ALU_OP0_NOP, tgsi_unsupported},
8414 [TGSI_OPCODE_ENDSWITCH] = { ALU_OP0_NOP, tgsi_unsupported},
/* DX10 SAMPLE_* opcodes are not implemented on this backend. */
8415 [TGSI_OPCODE_SAMPLE] = { 0, tgsi_unsupported},
8416 [TGSI_OPCODE_SAMPLE_I] = { 0, tgsi_unsupported},
8417 [TGSI_OPCODE_SAMPLE_I_MS] = { 0, tgsi_unsupported},
8418 [TGSI_OPCODE_SAMPLE_B] = { 0, tgsi_unsupported},
8419 [TGSI_OPCODE_SAMPLE_C] = { 0, tgsi_unsupported},
8420 [TGSI_OPCODE_SAMPLE_C_LZ] = { 0, tgsi_unsupported},
8421 [TGSI_OPCODE_SAMPLE_D] = { 0, tgsi_unsupported},
8422 [TGSI_OPCODE_SAMPLE_L] = { 0, tgsi_unsupported},
8423 [TGSI_OPCODE_GATHER4] = { 0, tgsi_unsupported},
8424 [TGSI_OPCODE_SVIEWINFO] = { 0, tgsi_unsupported},
8425 [TGSI_OPCODE_SAMPLE_POS] = { 0, tgsi_unsupported},
8426 [TGSI_OPCODE_SAMPLE_INFO] = { 0, tgsi_unsupported},
8427 [TGSI_OPCODE_UARL] = { ALU_OP1_MOVA_INT, tgsi_eg_arl},
8428 [TGSI_OPCODE_UCMP] = { ALU_OP0_NOP, tgsi_ucmp},
8429 [TGSI_OPCODE_IABS] = { 0, tgsi_iabs},
8430 [TGSI_OPCODE_ISSG] = { 0, tgsi_issg},
8431 [TGSI_OPCODE_LOAD] = { ALU_OP0_NOP, tgsi_unsupported},
8432 [TGSI_OPCODE_STORE] = { ALU_OP0_NOP, tgsi_unsupported},
8433 [TGSI_OPCODE_MFENCE] = { ALU_OP0_NOP, tgsi_unsupported},
8434 [TGSI_OPCODE_LFENCE] = { ALU_OP0_NOP, tgsi_unsupported},
8435 [TGSI_OPCODE_SFENCE] = { ALU_OP0_NOP, tgsi_unsupported},
8436 [TGSI_OPCODE_BARRIER] = { ALU_OP0_NOP, tgsi_unsupported},
8437 [TGSI_OPCODE_ATOMUADD] = { ALU_OP0_NOP, tgsi_unsupported},
8438 [TGSI_OPCODE_ATOMXCHG] = { ALU_OP0_NOP, tgsi_unsupported},
8439 [TGSI_OPCODE_ATOMCAS] = { ALU_OP0_NOP, tgsi_unsupported},
8440 [TGSI_OPCODE_ATOMAND] = { ALU_OP0_NOP, tgsi_unsupported},
8441 [TGSI_OPCODE_ATOMOR] = { ALU_OP0_NOP, tgsi_unsupported},
8442 [TGSI_OPCODE_ATOMXOR] = { ALU_OP0_NOP, tgsi_unsupported},
8443 [TGSI_OPCODE_ATOMUMIN] = { ALU_OP0_NOP, tgsi_unsupported},
8444 [TGSI_OPCODE_ATOMUMAX] = { ALU_OP0_NOP, tgsi_unsupported},
8445 [TGSI_OPCODE_ATOMIMIN] = { ALU_OP0_NOP, tgsi_unsupported},
8446 [TGSI_OPCODE_ATOMIMAX] = { ALU_OP0_NOP, tgsi_unsupported},
8447 [TGSI_OPCODE_TEX2] = { FETCH_OP_SAMPLE, tgsi_tex},
8448 [TGSI_OPCODE_TXB2] = { FETCH_OP_SAMPLE_LB, tgsi_tex},
8449 [TGSI_OPCODE_TXL2] = { FETCH_OP_SAMPLE_L, tgsi_tex},
8450 [TGSI_OPCODE_IMUL_HI] = { ALU_OP2_MULHI_INT, cayman_mul_int_instr},
8451 [TGSI_OPCODE_UMUL_HI] = { ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
8452 [TGSI_OPCODE_TG4] = { FETCH_OP_GATHER4, tgsi_tex},
8453 [TGSI_OPCODE_LODQ] = { FETCH_OP_GET_LOD, tgsi_tex},
/* Bitfield manipulation ops. */
8454 [TGSI_OPCODE_IBFE] = { ALU_OP3_BFE_INT, tgsi_op3},
8455 [TGSI_OPCODE_UBFE] = { ALU_OP3_BFE_UINT, tgsi_op3},
8456 [TGSI_OPCODE_BFI] = { ALU_OP0_NOP, tgsi_bfi},
8457 [TGSI_OPCODE_BREV] = { ALU_OP1_BFREV_INT, tgsi_op2},
8458 [TGSI_OPCODE_POPC] = { ALU_OP1_BCNT_INT, tgsi_op2},
8459 [TGSI_OPCODE_LSB] = { ALU_OP1_FFBL_INT, tgsi_op2},
8460 [TGSI_OPCODE_IMSB] = { ALU_OP1_FFBH_INT, tgsi_msb},
8461 [TGSI_OPCODE_UMSB] = { ALU_OP1_FFBH_UINT, tgsi_msb},
8462 [TGSI_OPCODE_INTERP_CENTROID] = { ALU_OP0_NOP, tgsi_interp_egcm},
8463 [TGSI_OPCODE_INTERP_SAMPLE] = { ALU_OP0_NOP, tgsi_interp_egcm},
8464 [TGSI_OPCODE_INTERP_OFFSET] = { ALU_OP0_NOP, tgsi_interp_egcm},
/* 64-bit (double-precision) ops; values occupy register pairs, so the
 * handlers use the *_64 emit paths (and cayman_* per-slot variants). */
8465 [TGSI_OPCODE_F2D] = { ALU_OP1_FLT32_TO_FLT64, tgsi_op2_64},
8466 [TGSI_OPCODE_D2F] = { ALU_OP1_FLT64_TO_FLT32, tgsi_op2_64_single_dest},
8467 [TGSI_OPCODE_DABS] = { ALU_OP1_MOV, tgsi_op2_64},
8468 [TGSI_OPCODE_DNEG] = { ALU_OP2_ADD_64, tgsi_dneg},
8469 [TGSI_OPCODE_DADD] = { ALU_OP2_ADD_64, tgsi_op2_64},
8470 [TGSI_OPCODE_DMUL] = { ALU_OP2_MUL_64, cayman_mul_double_instr},
8471 [TGSI_OPCODE_DMAX] = { ALU_OP2_MAX_64, tgsi_op2_64},
8472 [TGSI_OPCODE_DMIN] = { ALU_OP2_MIN_64, tgsi_op2_64},
/* DSLT: SETGT with swapped sources (the "_s" single-dest-swap variant). */
8473 [TGSI_OPCODE_DSLT] = { ALU_OP2_SETGT_64, tgsi_op2_64_single_dest_s},
8474 [TGSI_OPCODE_DSGE] = { ALU_OP2_SETGE_64, tgsi_op2_64_single_dest},
8475 [TGSI_OPCODE_DSEQ] = { ALU_OP2_SETE_64, tgsi_op2_64_single_dest},
8476 [TGSI_OPCODE_DSNE] = { ALU_OP2_SETNE_64, tgsi_op2_64_single_dest},
8477 [TGSI_OPCODE_DRCP] = { ALU_OP2_RECIP_64, cayman_emit_double_instr},
8478 [TGSI_OPCODE_DSQRT] = { ALU_OP2_SQRT_64, cayman_emit_double_instr},
8479 [TGSI_OPCODE_DMAD] = { ALU_OP3_FMA_64, tgsi_op3_64},
8480 [TGSI_OPCODE_DFRAC] = { ALU_OP1_FRACT_64, tgsi_op2_64},
8481 [TGSI_OPCODE_DLDEXP] = { ALU_OP2_LDEXP_64, tgsi_op2_64},
8482 [TGSI_OPCODE_DFRACEXP] = { ALU_OP1_FREXP_64, tgsi_dfracexp},
8483 [TGSI_OPCODE_D2I] = { ALU_OP1_FLT_TO_INT, egcm_double_to_int},
8484 [TGSI_OPCODE_I2D] = { ALU_OP1_INT_TO_FLT, egcm_int_to_double},
8485 [TGSI_OPCODE_D2U] = { ALU_OP1_FLT_TO_UINT, egcm_double_to_int},
8486 [TGSI_OPCODE_U2D] = { ALU_OP1_UINT_TO_FLT, egcm_int_to_double},
8487 [TGSI_OPCODE_DRSQ] = { ALU_OP2_RECIPSQRT_64, cayman_emit_double_instr},
/* Sentinel: keeps the array sized to cover every TGSI opcode. */
8488 [TGSI_OPCODE_LAST] = { ALU_OP0_NOP, tgsi_unsupported},
8489 };