r600g: Implement sm5 interpolation functions
[mesa.git] / src / gallium / drivers / r600 / r600_shader.c
1 /*
2 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 */
23 #include "r600_sq.h"
24 #include "r600_llvm.h"
25 #include "r600_formats.h"
26 #include "r600_opcodes.h"
27 #include "r600_shader.h"
28 #include "r600d.h"
29
30 #include "sb/sb_public.h"
31
32 #include "pipe/p_shader_tokens.h"
33 #include "tgsi/tgsi_info.h"
34 #include "tgsi/tgsi_parse.h"
35 #include "tgsi/tgsi_scan.h"
36 #include "tgsi/tgsi_dump.h"
37 #include "util/u_memory.h"
38 #include "util/u_math.h"
39 #include <stdio.h>
40 #include <errno.h>
41
/* CAYMAN notes
 * This note explains why many instructions are emitted in loops on CAYMAN.
 *
 * - These 8xx t-slot only ops are implemented in all vector slots:
 *     MUL_LIT, FLT_TO_UINT, INT_TO_FLT, UINT_TO_FLT
 *
 * - These 8xx t-slot only opcodes become vector ops, with all four
 *   slots expecting the arguments on sources a and b. The result is
 *   broadcast to all channels:
 *     MULLO_INT, MULHI_INT, MULLO_UINT, MULHI_UINT
 *
 * - These 8xx t-slot only opcodes become vector ops in the z, y, and
 *   x slots:
 *     EXP_IEEE, LOG_IEEE/CLAMPED, RECIP_IEEE/CLAMPED/FF/INT/UINT/_64/CLAMPED_64
 *     RECIPSQRT_IEEE/CLAMPED/FF/_64/CLAMPED_64
 *     SQRT_IEEE/_64
 *     SIN/COS
 *   The w slot may have an independent co-issued operation, or if the
 *   result is required to be in the w slot, the opcode above may be
 *   issued in the w slot as well.
 *   The compiler must issue the source argument to slots z, y, and x.
 */
62
/* Forward declaration: r600_pipe_shader_create() below calls this to
 * translate the TGSI tokens of a shader before building its bytecode. */
static int r600_shader_from_tgsi(struct r600_context *rctx,
				 struct r600_pipe_shader *pipeshader,
				 struct r600_shader_key key);
66
67
68 static void r600_add_gpr_array(struct r600_shader *ps, int start_gpr,
69 int size, unsigned comp_mask) {
70
71 if (!size)
72 return;
73
74 if (ps->num_arrays == ps->max_arrays) {
75 ps->max_arrays += 64;
76 ps->arrays = realloc(ps->arrays, ps->max_arrays *
77 sizeof(struct r600_shader_array));
78 }
79
80 int n = ps->num_arrays;
81 ++ps->num_arrays;
82
83 ps->arrays[n].comp_mask = comp_mask;
84 ps->arrays[n].gpr_start = start_gpr;
85 ps->arrays[n].gpr_count = size;
86 }
87
88 static void r600_dump_streamout(struct pipe_stream_output_info *so)
89 {
90 unsigned i;
91
92 fprintf(stderr, "STREAMOUT\n");
93 for (i = 0; i < so->num_outputs; i++) {
94 unsigned mask = ((1 << so->output[i].num_components) - 1) <<
95 so->output[i].start_component;
96 fprintf(stderr, " %i: MEM_STREAM0_BUF%i[%i..%i] <- OUT[%i].%s%s%s%s%s\n",
97 i, so->output[i].output_buffer,
98 so->output[i].dst_offset, so->output[i].dst_offset + so->output[i].num_components - 1,
99 so->output[i].register_index,
100 mask & 1 ? "x" : "",
101 mask & 2 ? "y" : "",
102 mask & 4 ? "z" : "",
103 mask & 8 ? "w" : "",
104 so->output[i].dst_offset < so->output[i].start_component ? " (will lower)" : "");
105 }
106 }
107
108 static int store_shader(struct pipe_context *ctx,
109 struct r600_pipe_shader *shader)
110 {
111 struct r600_context *rctx = (struct r600_context *)ctx;
112 uint32_t *ptr, i;
113
114 if (shader->bo == NULL) {
115 shader->bo = (struct r600_resource*)
116 pipe_buffer_create(ctx->screen, PIPE_BIND_CUSTOM, PIPE_USAGE_IMMUTABLE, shader->shader.bc.ndw * 4);
117 if (shader->bo == NULL) {
118 return -ENOMEM;
119 }
120 ptr = r600_buffer_map_sync_with_rings(&rctx->b, shader->bo, PIPE_TRANSFER_WRITE);
121 if (R600_BIG_ENDIAN) {
122 for (i = 0; i < shader->shader.bc.ndw; ++i) {
123 ptr[i] = util_cpu_to_le32(shader->shader.bc.bytecode[i]);
124 }
125 } else {
126 memcpy(ptr, shader->shader.bc.bytecode, shader->shader.bc.ndw * sizeof(*ptr));
127 }
128 rctx->b.ws->buffer_unmap(shader->bo->cs_buf);
129 }
130
131 return 0;
132 }
133
134 int r600_pipe_shader_create(struct pipe_context *ctx,
135 struct r600_pipe_shader *shader,
136 struct r600_shader_key key)
137 {
138 struct r600_context *rctx = (struct r600_context *)ctx;
139 struct r600_pipe_shader_selector *sel = shader->selector;
140 int r;
141 bool dump = r600_can_dump_shader(&rctx->screen->b, sel->tokens);
142 unsigned use_sb = !(rctx->screen->b.debug_flags & DBG_NO_SB);
143 unsigned sb_disasm = use_sb || (rctx->screen->b.debug_flags & DBG_SB_DISASM);
144 unsigned export_shader = key.vs_as_es;
145
146 shader->shader.bc.isa = rctx->isa;
147
148 if (dump) {
149 fprintf(stderr, "--------------------------------------------------------------\n");
150 tgsi_dump(sel->tokens, 0);
151
152 if (sel->so.num_outputs) {
153 r600_dump_streamout(&sel->so);
154 }
155 }
156 r = r600_shader_from_tgsi(rctx, shader, key);
157 if (r) {
158 R600_ERR("translation from TGSI failed !\n");
159 goto error;
160 }
161
162 /* disable SB for geom shaders - it can't handle the CF_EMIT instructions */
163 use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY);
164
165 /* Check if the bytecode has already been built. When using the llvm
166 * backend, r600_shader_from_tgsi() will take care of building the
167 * bytecode.
168 */
169 if (!shader->shader.bc.bytecode) {
170 r = r600_bytecode_build(&shader->shader.bc);
171 if (r) {
172 R600_ERR("building bytecode failed !\n");
173 goto error;
174 }
175 }
176
177 if (dump && !sb_disasm) {
178 fprintf(stderr, "--------------------------------------------------------------\n");
179 r600_bytecode_disasm(&shader->shader.bc);
180 fprintf(stderr, "______________________________________________________________\n");
181 } else if ((dump && sb_disasm) || use_sb) {
182 r = r600_sb_bytecode_process(rctx, &shader->shader.bc, &shader->shader,
183 dump, use_sb);
184 if (r) {
185 R600_ERR("r600_sb_bytecode_process failed !\n");
186 goto error;
187 }
188 }
189
190 if (shader->gs_copy_shader) {
191 if (dump) {
192 // dump copy shader
193 r = r600_sb_bytecode_process(rctx, &shader->gs_copy_shader->shader.bc,
194 &shader->gs_copy_shader->shader, dump, 0);
195 if (r)
196 goto error;
197 }
198
199 if ((r = store_shader(ctx, shader->gs_copy_shader)))
200 goto error;
201 }
202
203 /* Store the shader in a buffer. */
204 if ((r = store_shader(ctx, shader)))
205 goto error;
206
207 /* Build state. */
208 switch (shader->shader.processor_type) {
209 case TGSI_PROCESSOR_GEOMETRY:
210 if (rctx->b.chip_class >= EVERGREEN) {
211 evergreen_update_gs_state(ctx, shader);
212 evergreen_update_vs_state(ctx, shader->gs_copy_shader);
213 } else {
214 r600_update_gs_state(ctx, shader);
215 r600_update_vs_state(ctx, shader->gs_copy_shader);
216 }
217 break;
218 case TGSI_PROCESSOR_VERTEX:
219 if (rctx->b.chip_class >= EVERGREEN) {
220 if (export_shader)
221 evergreen_update_es_state(ctx, shader);
222 else
223 evergreen_update_vs_state(ctx, shader);
224 } else {
225 if (export_shader)
226 r600_update_es_state(ctx, shader);
227 else
228 r600_update_vs_state(ctx, shader);
229 }
230 break;
231 case TGSI_PROCESSOR_FRAGMENT:
232 if (rctx->b.chip_class >= EVERGREEN) {
233 evergreen_update_ps_state(ctx, shader);
234 } else {
235 r600_update_ps_state(ctx, shader);
236 }
237 break;
238 default:
239 r = -EINVAL;
240 goto error;
241 }
242 return 0;
243
244 error:
245 r600_pipe_shader_destroy(ctx, shader);
246 return r;
247 }
248
249 void r600_pipe_shader_destroy(struct pipe_context *ctx, struct r600_pipe_shader *shader)
250 {
251 pipe_resource_reference((struct pipe_resource**)&shader->bo, NULL);
252 r600_bytecode_clear(&shader->shader.bc);
253 r600_release_command_buffer(&shader->command_buffer);
254 }
255
256 /*
257 * tgsi -> r600 shader
258 */
/* Declared here because r600_shader_ctx points at it before it is defined. */
struct r600_shader_tgsi_instruction;

/* A source operand in r600 terms, as filled in by tgsi_src(). */
struct r600_shader_src {
	unsigned sel;		/* source selector (GPR index or special V_SQ_ALU_SRC_* value) */
	unsigned swizzle[4];	/* source channel read for each of x, y, z, w */
	unsigned neg;		/* negate modifier */
	unsigned abs;		/* absolute-value modifier */
	unsigned rel;		/* V_SQ_REL_RELATIVE for indirect addressing, else 0 */
	unsigned kc_bank;	/* constant buffer index for two-dimensional constants */
	uint32_t value[4];	/* literal values when sel is V_SQ_ALU_SRC_LITERAL */
};
270
271 struct eg_interp {
272 boolean enabled;
273 unsigned ij_index;
274 };
275
276 struct r600_shader_ctx {
277 struct tgsi_shader_info info;
278 struct tgsi_parse_context parse;
279 const struct tgsi_token *tokens;
280 unsigned type;
281 unsigned file_offset[TGSI_FILE_COUNT];
282 unsigned temp_reg;
283 struct r600_shader_tgsi_instruction *inst_info;
284 struct r600_bytecode *bc;
285 struct r600_shader *shader;
286 struct r600_shader_src src[4];
287 uint32_t *literals;
288 uint32_t nliterals;
289 uint32_t max_driver_temp_used;
290 boolean use_llvm;
291 /* needed for evergreen interpolation */
292 struct eg_interp eg_interpolators[6]; // indexed by Persp/Linear * 3 + sample/center/centroid
293 /* evergreen/cayman also store sample mask in face register */
294 int face_gpr;
295 /* sample id is .w component stored in fixed point position register */
296 int fixed_pt_position_gpr;
297 int colors_used;
298 boolean clip_vertex_write;
299 unsigned cv_output;
300 unsigned edgeflag_output;
301 int fragcoord_input;
302 int native_integers;
303 int next_ring_offset;
304 int gs_out_ring_offset;
305 int gs_next_vertex;
306 struct r600_shader *gs_for_vs;
307 int gs_export_gpr_treg;
308 };
309
/* One entry of a per-chip TGSI opcode translation table. */
struct r600_shader_tgsi_instruction {
	unsigned tgsi_opcode;				/* TGSI opcode this entry describes */
	unsigned is_op3;
	unsigned op;
	int (*process)(struct r600_shader_ctx *ctx);	/* handler invoked with the translation context */
};
316
317 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind);
318 static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[], eg_shader_tgsi_instruction[], cm_shader_tgsi_instruction[];
319 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx);
320 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason);
321 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type);
322 static int tgsi_else(struct r600_shader_ctx *ctx);
323 static int tgsi_endif(struct r600_shader_ctx *ctx);
324 static int tgsi_bgnloop(struct r600_shader_ctx *ctx);
325 static int tgsi_endloop(struct r600_shader_ctx *ctx);
326 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx);
327 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
328 unsigned int cb_idx, unsigned int offset, unsigned ar_chan,
329 unsigned int dst_reg);
330 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
331 const struct r600_shader_src *shader_src,
332 unsigned chan);
333
334 static int tgsi_is_supported(struct r600_shader_ctx *ctx)
335 {
336 struct tgsi_full_instruction *i = &ctx->parse.FullToken.FullInstruction;
337 int j;
338
339 if (i->Instruction.NumDstRegs > 1) {
340 R600_ERR("too many dst (%d)\n", i->Instruction.NumDstRegs);
341 return -EINVAL;
342 }
343 if (i->Instruction.Predicate) {
344 R600_ERR("predicate unsupported\n");
345 return -EINVAL;
346 }
347 #if 0
348 if (i->Instruction.Label) {
349 R600_ERR("label unsupported\n");
350 return -EINVAL;
351 }
352 #endif
353 for (j = 0; j < i->Instruction.NumSrcRegs; j++) {
354 if (i->Src[j].Register.Dimension) {
355 switch (i->Src[j].Register.File) {
356 case TGSI_FILE_CONSTANT:
357 break;
358 case TGSI_FILE_INPUT:
359 if (ctx->type == TGSI_PROCESSOR_GEOMETRY)
360 break;
361 default:
362 R600_ERR("unsupported src %d (dimension %d)\n", j,
363 i->Src[j].Register.Dimension);
364 return -EINVAL;
365 }
366 }
367 }
368 for (j = 0; j < i->Instruction.NumDstRegs; j++) {
369 if (i->Dst[j].Register.Dimension) {
370 R600_ERR("unsupported dst (dimension)\n");
371 return -EINVAL;
372 }
373 }
374 return 0;
375 }
376
377 int eg_get_interpolator_index(unsigned interpolate, unsigned location)
378 {
379 if (interpolate == TGSI_INTERPOLATE_COLOR ||
380 interpolate == TGSI_INTERPOLATE_LINEAR ||
381 interpolate == TGSI_INTERPOLATE_PERSPECTIVE)
382 {
383 int is_linear = interpolate == TGSI_INTERPOLATE_LINEAR;
384 int loc;
385
386 switch(location) {
387 case TGSI_INTERPOLATE_LOC_CENTER:
388 loc = 1;
389 break;
390 case TGSI_INTERPOLATE_LOC_CENTROID:
391 loc = 2;
392 break;
393 case TGSI_INTERPOLATE_LOC_SAMPLE:
394 default:
395 loc = 0; break;
396 }
397
398 return is_linear * 3 + loc;
399 }
400
401 return -1;
402 }
403
404 static void evergreen_interp_assign_ij_index(struct r600_shader_ctx *ctx,
405 int input)
406 {
407 int i = eg_get_interpolator_index(
408 ctx->shader->input[input].interpolate,
409 ctx->shader->input[input].interpolate_location);
410 assert(i >= 0);
411 ctx->shader->input[input].ij_index = ctx->eg_interpolators[i].ij_index;
412 }
413
414 static int evergreen_interp_alu(struct r600_shader_ctx *ctx, int input)
415 {
416 int i, r;
417 struct r600_bytecode_alu alu;
418 int gpr = 0, base_chan = 0;
419 int ij_index = ctx->shader->input[input].ij_index;
420
421 /* work out gpr and base_chan from index */
422 gpr = ij_index / 2;
423 base_chan = (2 * (ij_index % 2)) + 1;
424
425 for (i = 0; i < 8; i++) {
426 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
427
428 if (i < 4)
429 alu.op = ALU_OP2_INTERP_ZW;
430 else
431 alu.op = ALU_OP2_INTERP_XY;
432
433 if ((i > 1) && (i < 6)) {
434 alu.dst.sel = ctx->shader->input[input].gpr;
435 alu.dst.write = 1;
436 }
437
438 alu.dst.chan = i % 4;
439
440 alu.src[0].sel = gpr;
441 alu.src[0].chan = (base_chan - (i % 2));
442
443 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
444
445 alu.bank_swizzle_force = SQ_ALU_VEC_210;
446 if ((i % 4) == 3)
447 alu.last = 1;
448 r = r600_bytecode_add_alu(ctx->bc, &alu);
449 if (r)
450 return r;
451 }
452 return 0;
453 }
454
455 static int evergreen_interp_flat(struct r600_shader_ctx *ctx, int input)
456 {
457 int i, r;
458 struct r600_bytecode_alu alu;
459
460 for (i = 0; i < 4; i++) {
461 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
462
463 alu.op = ALU_OP1_INTERP_LOAD_P0;
464
465 alu.dst.sel = ctx->shader->input[input].gpr;
466 alu.dst.write = 1;
467
468 alu.dst.chan = i;
469
470 alu.src[0].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
471 alu.src[0].chan = i;
472
473 if (i == 3)
474 alu.last = 1;
475 r = r600_bytecode_add_alu(ctx->bc, &alu);
476 if (r)
477 return r;
478 }
479 return 0;
480 }
481
482 /*
483 * Special export handling in shaders
484 *
485 * shader export ARRAY_BASE for EXPORT_POS:
486 * 60 is position
487 * 61 is misc vector
488 * 62, 63 are clip distance vectors
489 *
490 * The use of the values exported in 61-63 are controlled by PA_CL_VS_OUT_CNTL:
491 * VS_OUT_MISC_VEC_ENA - enables the use of all fields in export 61
492 * USE_VTX_POINT_SIZE - point size in the X channel of export 61
493 * USE_VTX_EDGE_FLAG - edge flag in the Y channel of export 61
494 * USE_VTX_RENDER_TARGET_INDX - render target index in the Z channel of export 61
495 * USE_VTX_VIEWPORT_INDX - viewport index in the W channel of export 61
496 * USE_VTX_KILL_FLAG - kill flag in the Z channel of export 61 (mutually
497 * exclusive from render target index)
498 * VS_OUT_CCDIST0_VEC_ENA/VS_OUT_CCDIST1_VEC_ENA - enable clip distance vectors
499 *
500 *
501 * shader export ARRAY_BASE for EXPORT_PIXEL:
502 * 0-7 CB targets
503 * 61 computed Z vector
504 *
505 * The use of the values exported in the computed Z vector are controlled
506 * by DB_SHADER_CONTROL:
507 * Z_EXPORT_ENABLE - Z as a float in RED
508 * STENCIL_REF_EXPORT_ENABLE - stencil ref as int in GREEN
509 * COVERAGE_TO_MASK_ENABLE - alpha to mask in ALPHA
510 * MASK_EXPORT_ENABLE - pixel sample mask in BLUE
511 * DB_SOURCE_FORMAT - export control restrictions
512 *
513 */
514
515
516 /* Map name/sid pair from tgsi to the 8-bit semantic index for SPI setup */
517 static int r600_spi_sid(struct r600_shader_io * io)
518 {
519 int index, name = io->name;
520
521 /* These params are handled differently, they don't need
522 * semantic indices, so we'll use 0 for them.
523 */
524 if (name == TGSI_SEMANTIC_POSITION ||
525 name == TGSI_SEMANTIC_PSIZE ||
526 name == TGSI_SEMANTIC_EDGEFLAG ||
527 name == TGSI_SEMANTIC_FACE ||
528 name == TGSI_SEMANTIC_SAMPLEMASK)
529 index = 0;
530 else {
531 if (name == TGSI_SEMANTIC_GENERIC) {
532 /* For generic params simply use sid from tgsi */
533 index = io->sid;
534 } else {
535 /* For non-generic params - pack name and sid into 8 bits */
536 index = 0x80 | (name<<3) | (io->sid);
537 }
538
539 /* Make sure that all really used indices have nonzero value, so
540 * we can just compare it to 0 later instead of comparing the name
541 * with different values to detect special cases. */
542 index++;
543 }
544
545 return index;
546 };
547
548 /* turn input into interpolate on EG */
549 static int evergreen_interp_input(struct r600_shader_ctx *ctx, int index)
550 {
551 int r = 0;
552
553 if (ctx->shader->input[index].spi_sid) {
554 ctx->shader->input[index].lds_pos = ctx->shader->nlds++;
555 if (ctx->shader->input[index].interpolate > 0) {
556 evergreen_interp_assign_ij_index(ctx, index);
557 if (!ctx->use_llvm)
558 r = evergreen_interp_alu(ctx, index);
559 } else {
560 if (!ctx->use_llvm)
561 r = evergreen_interp_flat(ctx, index);
562 }
563 }
564 return r;
565 }
566
567 static int select_twoside_color(struct r600_shader_ctx *ctx, int front, int back)
568 {
569 struct r600_bytecode_alu alu;
570 int i, r;
571 int gpr_front = ctx->shader->input[front].gpr;
572 int gpr_back = ctx->shader->input[back].gpr;
573
574 for (i = 0; i < 4; i++) {
575 memset(&alu, 0, sizeof(alu));
576 alu.op = ALU_OP3_CNDGT;
577 alu.is_op3 = 1;
578 alu.dst.write = 1;
579 alu.dst.sel = gpr_front;
580 alu.src[0].sel = ctx->face_gpr;
581 alu.src[1].sel = gpr_front;
582 alu.src[2].sel = gpr_back;
583
584 alu.dst.chan = i;
585 alu.src[1].chan = i;
586 alu.src[2].chan = i;
587 alu.last = (i==3);
588
589 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
590 return r;
591 }
592
593 return 0;
594 }
595
596 static int tgsi_declaration(struct r600_shader_ctx *ctx)
597 {
598 struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
599 int r, i, j, count = d->Range.Last - d->Range.First + 1;
600
601 switch (d->Declaration.File) {
602 case TGSI_FILE_INPUT:
603 i = ctx->shader->ninput;
604 assert(i < Elements(ctx->shader->input));
605 ctx->shader->ninput += count;
606 ctx->shader->input[i].name = d->Semantic.Name;
607 ctx->shader->input[i].sid = d->Semantic.Index;
608 ctx->shader->input[i].interpolate = d->Interp.Interpolate;
609 ctx->shader->input[i].interpolate_location = d->Interp.Location;
610 ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
611 if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
612 ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
613 switch (ctx->shader->input[i].name) {
614 case TGSI_SEMANTIC_FACE:
615 if (ctx->face_gpr != -1)
616 ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
617 else
618 ctx->face_gpr = ctx->shader->input[i].gpr;
619 break;
620 case TGSI_SEMANTIC_COLOR:
621 ctx->colors_used++;
622 break;
623 case TGSI_SEMANTIC_POSITION:
624 ctx->fragcoord_input = i;
625 break;
626 }
627 if (ctx->bc->chip_class >= EVERGREEN) {
628 if ((r = evergreen_interp_input(ctx, i)))
629 return r;
630 }
631 } else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
632 /* FIXME probably skip inputs if they aren't passed in the ring */
633 ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
634 ctx->next_ring_offset += 16;
635 if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
636 ctx->shader->gs_prim_id_input = true;
637 }
638 for (j = 1; j < count; ++j) {
639 ctx->shader->input[i + j] = ctx->shader->input[i];
640 ctx->shader->input[i + j].gpr += j;
641 }
642 break;
643 case TGSI_FILE_OUTPUT:
644 i = ctx->shader->noutput++;
645 assert(i < Elements(ctx->shader->output));
646 ctx->shader->output[i].name = d->Semantic.Name;
647 ctx->shader->output[i].sid = d->Semantic.Index;
648 ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
649 ctx->shader->output[i].interpolate = d->Interp.Interpolate;
650 ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
651 if (ctx->type == TGSI_PROCESSOR_VERTEX ||
652 ctx->type == TGSI_PROCESSOR_GEOMETRY) {
653 ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
654 switch (d->Semantic.Name) {
655 case TGSI_SEMANTIC_CLIPDIST:
656 ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
657 break;
658 case TGSI_SEMANTIC_PSIZE:
659 ctx->shader->vs_out_misc_write = 1;
660 ctx->shader->vs_out_point_size = 1;
661 break;
662 case TGSI_SEMANTIC_EDGEFLAG:
663 ctx->shader->vs_out_misc_write = 1;
664 ctx->shader->vs_out_edgeflag = 1;
665 ctx->edgeflag_output = i;
666 break;
667 case TGSI_SEMANTIC_VIEWPORT_INDEX:
668 ctx->shader->vs_out_misc_write = 1;
669 ctx->shader->vs_out_viewport = 1;
670 break;
671 case TGSI_SEMANTIC_LAYER:
672 ctx->shader->vs_out_misc_write = 1;
673 ctx->shader->vs_out_layer = 1;
674 break;
675 case TGSI_SEMANTIC_CLIPVERTEX:
676 ctx->clip_vertex_write = TRUE;
677 ctx->cv_output = i;
678 break;
679 }
680 if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
681 ctx->gs_out_ring_offset += 16;
682 }
683 } else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
684 switch (d->Semantic.Name) {
685 case TGSI_SEMANTIC_COLOR:
686 ctx->shader->nr_ps_max_color_exports++;
687 break;
688 }
689 }
690 break;
691 case TGSI_FILE_TEMPORARY:
692 if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
693 if (d->Array.ArrayID) {
694 r600_add_gpr_array(ctx->shader,
695 ctx->file_offset[TGSI_FILE_TEMPORARY] +
696 d->Range.First,
697 d->Range.Last - d->Range.First + 1, 0x0F);
698 }
699 }
700 break;
701
702 case TGSI_FILE_CONSTANT:
703 case TGSI_FILE_SAMPLER:
704 case TGSI_FILE_ADDRESS:
705 break;
706
707 case TGSI_FILE_SYSTEM_VALUE:
708 if (d->Semantic.Name == TGSI_SEMANTIC_SAMPLEMASK ||
709 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEID ||
710 d->Semantic.Name == TGSI_SEMANTIC_SAMPLEPOS) {
711 break; /* Already handled from allocate_system_value_inputs */
712 } else if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
713 if (!ctx->native_integers) {
714 struct r600_bytecode_alu alu;
715 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
716
717 alu.op = ALU_OP1_INT_TO_FLT;
718 alu.src[0].sel = 0;
719 alu.src[0].chan = 3;
720
721 alu.dst.sel = 0;
722 alu.dst.chan = 3;
723 alu.dst.write = 1;
724 alu.last = 1;
725
726 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
727 return r;
728 }
729 break;
730 } else if (d->Semantic.Name == TGSI_SEMANTIC_VERTEXID)
731 break;
732 else if (d->Semantic.Name == TGSI_SEMANTIC_INVOCATIONID)
733 break;
734 default:
735 R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
736 return -EINVAL;
737 }
738 return 0;
739 }
740
741 static int r600_get_temp(struct r600_shader_ctx *ctx)
742 {
743 return ctx->temp_reg + ctx->max_driver_temp_used++;
744 }
745
746 static int allocate_system_value_inputs(struct r600_shader_ctx *ctx, int gpr_offset)
747 {
748 struct tgsi_parse_context parse;
749 struct {
750 boolean enabled;
751 int *reg;
752 unsigned name, alternate_name;
753 } inputs[2] = {
754 { false, &ctx->face_gpr, TGSI_SEMANTIC_SAMPLEMASK, ~0u }, /* lives in Front Face GPR.z */
755
756 { false, &ctx->fixed_pt_position_gpr, TGSI_SEMANTIC_SAMPLEID, TGSI_SEMANTIC_SAMPLEPOS } /* SAMPLEID is in Fixed Point Position GPR.w */
757 };
758 int i, k, num_regs = 0;
759
760 if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
761 return 0;
762 }
763
764 /* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
765 while (!tgsi_parse_end_of_tokens(&parse)) {
766 tgsi_parse_token(&parse);
767
768 if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
769 const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
770 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
771 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
772 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
773 {
774 int interpolate, location, k;
775
776 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
777 location = TGSI_INTERPOLATE_LOC_CENTER;
778 inputs[1].enabled = true; /* needs SAMPLEID */
779 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
780 location = TGSI_INTERPOLATE_LOC_CENTER;
781 /* Needs sample positions, currently those are always available */
782 } else {
783 location = TGSI_INTERPOLATE_LOC_CENTROID;
784 }
785
786 interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
787 k = eg_get_interpolator_index(interpolate, location);
788 ctx->eg_interpolators[k].enabled = true;
789 }
790 } else if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_DECLARATION) {
791 struct tgsi_full_declaration *d = &parse.FullToken.FullDeclaration;
792 if (d->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
793 for (k = 0; k < Elements(inputs); k++) {
794 if (d->Semantic.Name == inputs[k].name ||
795 d->Semantic.Name == inputs[k].alternate_name) {
796 inputs[k].enabled = true;
797 }
798 }
799 }
800 }
801 }
802
803 tgsi_parse_free(&parse);
804
805 for (i = 0; i < Elements(inputs); i++) {
806 boolean enabled = inputs[i].enabled;
807 int *reg = inputs[i].reg;
808 unsigned name = inputs[i].name;
809
810 if (enabled) {
811 int gpr = gpr_offset + num_regs++;
812
813 // add to inputs, allocate a gpr
814 k = ctx->shader->ninput ++;
815 ctx->shader->input[k].name = name;
816 ctx->shader->input[k].sid = 0;
817 ctx->shader->input[k].interpolate = TGSI_INTERPOLATE_CONSTANT;
818 ctx->shader->input[k].interpolate_location = TGSI_INTERPOLATE_LOC_CENTER;
819 *reg = ctx->shader->input[k].gpr = gpr;
820 }
821 }
822
823 return gpr_offset + num_regs;
824 }
825
826 /*
827 * for evergreen we need to scan the shader to find the number of GPRs we need to
828 * reserve for interpolation and system values
829 *
830 * we need to know if we are going to emit
831 * any sample or centroid inputs
832 * if perspective and linear are required
833 */
834 static int evergreen_gpr_count(struct r600_shader_ctx *ctx)
835 {
836 int i;
837 int num_baryc;
838 struct tgsi_parse_context parse;
839
840 memset(&ctx->eg_interpolators, 0, sizeof(ctx->eg_interpolators));
841
842 for (i = 0; i < ctx->info.num_inputs; i++) {
843 int k;
844 /* skip position/face/mask/sampleid */
845 if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_POSITION ||
846 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FACE ||
847 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEMASK ||
848 ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_SAMPLEID)
849 continue;
850
851 k = eg_get_interpolator_index(
852 ctx->info.input_interpolate[i],
853 ctx->info.input_interpolate_loc[i]);
854 if (k >= 0)
855 ctx->eg_interpolators[k].enabled = TRUE;
856 }
857
858 if (tgsi_parse_init(&parse, ctx->tokens) != TGSI_PARSE_OK) {
859 return 0;
860 }
861
862 /* need to scan shader for system values and interpolateAtSample/Offset/Centroid */
863 while (!tgsi_parse_end_of_tokens(&parse)) {
864 tgsi_parse_token(&parse);
865
866 if (parse.FullToken.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION) {
867 const struct tgsi_full_instruction *inst = &parse.FullToken.FullInstruction;
868 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE ||
869 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
870 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_CENTROID)
871 {
872 int interpolate, location, k;
873
874 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
875 location = TGSI_INTERPOLATE_LOC_CENTER;
876 } else if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET) {
877 location = TGSI_INTERPOLATE_LOC_CENTER;
878 } else {
879 location = TGSI_INTERPOLATE_LOC_CENTROID;
880 }
881
882 interpolate = ctx->info.input_interpolate[inst->Src[0].Register.Index];
883 k = eg_get_interpolator_index(interpolate, location);
884 ctx->eg_interpolators[k].enabled = true;
885 }
886 }
887 }
888
889 tgsi_parse_free(&parse);
890
891 /* assign gpr to each interpolator according to priority */
892 num_baryc = 0;
893 for (i = 0; i < Elements(ctx->eg_interpolators); i++) {
894 if (ctx->eg_interpolators[i].enabled) {
895 ctx->eg_interpolators[i].ij_index = num_baryc;
896 num_baryc ++;
897 }
898 }
899
900 /* XXX PULL MODEL and LINE STIPPLE */
901
902 num_baryc = (num_baryc + 1) >> 1;
903 return allocate_system_value_inputs(ctx, num_baryc);
904 }
905
906 /* sample_id_sel == NULL means fetch for current sample */
907 static int load_sample_position(struct r600_shader_ctx *ctx, struct r600_shader_src *sample_id, int chan_sel)
908 {
909 struct r600_bytecode_vtx vtx;
910 int r, t1;
911
912 assert(ctx->fixed_pt_position_gpr != -1);
913
914 t1 = r600_get_temp(ctx);
915
916 memset(&vtx, 0, sizeof(struct r600_bytecode_vtx));
917 vtx.op = FETCH_OP_VFETCH;
918 vtx.buffer_id = R600_SAMPLE_POSITIONS_CONST_BUFFER;
919 vtx.fetch_type = 2; /* VTX_FETCH_NO_INDEX_OFFSET */
920 if (sample_id == NULL) {
921 vtx.src_gpr = ctx->fixed_pt_position_gpr; // SAMPLEID is in .w;
922 vtx.src_sel_x = 3;
923 }
924 else {
925 struct r600_bytecode_alu alu;
926
927 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
928 alu.op = ALU_OP1_MOV;
929 r600_bytecode_src(&alu.src[0], sample_id, chan_sel);
930 alu.dst.sel = t1;
931 alu.dst.write = 1;
932 alu.last = 1;
933 r = r600_bytecode_add_alu(ctx->bc, &alu);
934 if (r)
935 return r;
936
937 vtx.src_gpr = t1;
938 vtx.src_sel_x = 0;
939 }
940 vtx.mega_fetch_count = 16;
941 vtx.dst_gpr = t1;
942 vtx.dst_sel_x = 0;
943 vtx.dst_sel_y = 1;
944 vtx.dst_sel_z = 2;
945 vtx.dst_sel_w = 3;
946 vtx.data_format = FMT_32_32_32_32_FLOAT;
947 vtx.num_format_all = 2;
948 vtx.format_comp_all = 1;
949 vtx.use_const_fields = 0;
950 vtx.offset = 1; // first element is size of buffer
951 vtx.endian = r600_endian_swap(32);
952 vtx.srf_mode_all = 1; /* SRF_MODE_NO_ZERO */
953
954 r = r600_bytecode_add_vtx(ctx->bc, &vtx);
955 if (r)
956 return r;
957
958 return t1;
959 }
960
961 static void tgsi_src(struct r600_shader_ctx *ctx,
962 const struct tgsi_full_src_register *tgsi_src,
963 struct r600_shader_src *r600_src)
964 {
965 memset(r600_src, 0, sizeof(*r600_src));
966 r600_src->swizzle[0] = tgsi_src->Register.SwizzleX;
967 r600_src->swizzle[1] = tgsi_src->Register.SwizzleY;
968 r600_src->swizzle[2] = tgsi_src->Register.SwizzleZ;
969 r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
970 r600_src->neg = tgsi_src->Register.Negate;
971 r600_src->abs = tgsi_src->Register.Absolute;
972
973 if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
974 int index;
975 if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
976 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleZ) &&
977 (tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleW)) {
978
979 index = tgsi_src->Register.Index * 4 + tgsi_src->Register.SwizzleX;
980 r600_bytecode_special_constants(ctx->literals[index], &r600_src->sel, &r600_src->neg);
981 if (r600_src->sel != V_SQ_ALU_SRC_LITERAL)
982 return;
983 }
984 index = tgsi_src->Register.Index;
985 r600_src->sel = V_SQ_ALU_SRC_LITERAL;
986 memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
987 } else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
988 if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEMASK) {
989 r600_src->swizzle[0] = 2; // Z value
990 r600_src->swizzle[1] = 2;
991 r600_src->swizzle[2] = 2;
992 r600_src->swizzle[3] = 2;
993 r600_src->sel = ctx->face_gpr;
994 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEID) {
995 r600_src->swizzle[0] = 3; // W value
996 r600_src->swizzle[1] = 3;
997 r600_src->swizzle[2] = 3;
998 r600_src->swizzle[3] = 3;
999 r600_src->sel = ctx->fixed_pt_position_gpr;
1000 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_SAMPLEPOS) {
1001 r600_src->swizzle[0] = 0;
1002 r600_src->swizzle[1] = 1;
1003 r600_src->swizzle[2] = 4;
1004 r600_src->swizzle[3] = 4;
1005 r600_src->sel = load_sample_position(ctx, NULL, -1);
1006 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INSTANCEID) {
1007 r600_src->swizzle[0] = 3;
1008 r600_src->swizzle[1] = 3;
1009 r600_src->swizzle[2] = 3;
1010 r600_src->swizzle[3] = 3;
1011 r600_src->sel = 0;
1012 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_VERTEXID) {
1013 r600_src->swizzle[0] = 0;
1014 r600_src->swizzle[1] = 0;
1015 r600_src->swizzle[2] = 0;
1016 r600_src->swizzle[3] = 0;
1017 r600_src->sel = 0;
1018 } else if (ctx->info.system_value_semantic_name[tgsi_src->Register.Index] == TGSI_SEMANTIC_INVOCATIONID) {
1019 r600_src->swizzle[0] = 3;
1020 r600_src->swizzle[1] = 3;
1021 r600_src->swizzle[2] = 3;
1022 r600_src->swizzle[3] = 3;
1023 r600_src->sel = 1;
1024 }
1025 } else {
1026 if (tgsi_src->Register.Indirect)
1027 r600_src->rel = V_SQ_REL_RELATIVE;
1028 r600_src->sel = tgsi_src->Register.Index;
1029 r600_src->sel += ctx->file_offset[tgsi_src->Register.File];
1030 }
1031 if (tgsi_src->Register.File == TGSI_FILE_CONSTANT) {
1032 if (tgsi_src->Register.Dimension) {
1033 r600_src->kc_bank = tgsi_src->Dimension.Index;
1034 }
1035 }
1036 }
1037
1038 static int tgsi_fetch_rel_const(struct r600_shader_ctx *ctx,
1039 unsigned int cb_idx, unsigned int offset, unsigned ar_chan,
1040 unsigned int dst_reg)
1041 {
1042 struct r600_bytecode_vtx vtx;
1043 unsigned int ar_reg;
1044 int r;
1045
1046 if (offset) {
1047 struct r600_bytecode_alu alu;
1048
1049 memset(&alu, 0, sizeof(alu));
1050
1051 alu.op = ALU_OP2_ADD_INT;
1052 alu.src[0].sel = ctx->bc->ar_reg;
1053 alu.src[0].chan = ar_chan;
1054
1055 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1056 alu.src[1].value = offset;
1057
1058 alu.dst.sel = dst_reg;
1059 alu.dst.chan = ar_chan;
1060 alu.dst.write = 1;
1061 alu.last = 1;
1062
1063 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
1064 return r;
1065
1066 ar_reg = dst_reg;
1067 } else {
1068 ar_reg = ctx->bc->ar_reg;
1069 }
1070
1071 memset(&vtx, 0, sizeof(vtx));
1072 vtx.buffer_id = cb_idx;
1073 vtx.fetch_type = 2; /* VTX_FETCH_NO_INDEX_OFFSET */
1074 vtx.src_gpr = ar_reg;
1075 vtx.src_sel_x = ar_chan;
1076 vtx.mega_fetch_count = 16;
1077 vtx.dst_gpr = dst_reg;
1078 vtx.dst_sel_x = 0; /* SEL_X */
1079 vtx.dst_sel_y = 1; /* SEL_Y */
1080 vtx.dst_sel_z = 2; /* SEL_Z */
1081 vtx.dst_sel_w = 3; /* SEL_W */
1082 vtx.data_format = FMT_32_32_32_32_FLOAT;
1083 vtx.num_format_all = 2; /* NUM_FORMAT_SCALED */
1084 vtx.format_comp_all = 1; /* FORMAT_COMP_SIGNED */
1085 vtx.endian = r600_endian_swap(32);
1086
1087 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1088 return r;
1089
1090 return 0;
1091 }
1092
1093 static int fetch_gs_input(struct r600_shader_ctx *ctx, struct tgsi_full_src_register *src, unsigned int dst_reg)
1094 {
1095 struct r600_bytecode_vtx vtx;
1096 int r;
1097 unsigned index = src->Register.Index;
1098 unsigned vtx_id = src->Dimension.Index;
1099 int offset_reg = vtx_id / 3;
1100 int offset_chan = vtx_id % 3;
1101
1102 /* offsets of per-vertex data in ESGS ring are passed to GS in R0.x, R0.y,
1103 * R0.w, R1.x, R1.y, R1.z (it seems R0.z is used for PrimitiveID) */
1104
1105 if (offset_reg == 0 && offset_chan == 2)
1106 offset_chan = 3;
1107
1108 if (src->Dimension.Indirect) {
1109 int treg[3];
1110 int t2;
1111 struct r600_bytecode_alu alu;
1112 int r, i;
1113
1114 /* you have got to be shitting me -
1115 we have to put the R0.x/y/w into Rt.x Rt+1.x Rt+2.x then index reg from Rt.
1116 at least this is what fglrx seems to do. */
1117 for (i = 0; i < 3; i++) {
1118 treg[i] = r600_get_temp(ctx);
1119 }
1120 t2 = r600_get_temp(ctx);
1121 for (i = 0; i < 3; i++) {
1122 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1123 alu.op = ALU_OP1_MOV;
1124 alu.src[0].sel = 0;
1125 alu.src[0].chan = i == 2 ? 3 : i;
1126 alu.dst.sel = treg[i];
1127 alu.dst.chan = 0;
1128 alu.dst.write = 1;
1129 alu.last = 1;
1130 r = r600_bytecode_add_alu(ctx->bc, &alu);
1131 if (r)
1132 return r;
1133 }
1134 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1135 alu.op = ALU_OP1_MOV;
1136 alu.src[0].sel = treg[0];
1137 alu.src[0].rel = 1;
1138 alu.dst.sel = t2;
1139 alu.dst.write = 1;
1140 alu.last = 1;
1141 r = r600_bytecode_add_alu(ctx->bc, &alu);
1142 if (r)
1143 return r;
1144 offset_reg = t2;
1145 }
1146
1147
1148 memset(&vtx, 0, sizeof(vtx));
1149 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1150 vtx.fetch_type = 2; /* VTX_FETCH_NO_INDEX_OFFSET */
1151 vtx.src_gpr = offset_reg;
1152 vtx.src_sel_x = offset_chan;
1153 vtx.offset = index * 16; /*bytes*/
1154 vtx.mega_fetch_count = 16;
1155 vtx.dst_gpr = dst_reg;
1156 vtx.dst_sel_x = 0; /* SEL_X */
1157 vtx.dst_sel_y = 1; /* SEL_Y */
1158 vtx.dst_sel_z = 2; /* SEL_Z */
1159 vtx.dst_sel_w = 3; /* SEL_W */
1160 if (ctx->bc->chip_class >= EVERGREEN) {
1161 vtx.use_const_fields = 1;
1162 } else {
1163 vtx.data_format = FMT_32_32_32_32_FLOAT;
1164 }
1165
1166 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
1167 return r;
1168
1169 return 0;
1170 }
1171
1172 static int tgsi_split_gs_inputs(struct r600_shader_ctx *ctx)
1173 {
1174 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1175 int i;
1176
1177 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
1178 struct tgsi_full_src_register *src = &inst->Src[i];
1179
1180 if (src->Register.File == TGSI_FILE_INPUT) {
1181 if (ctx->shader->input[src->Register.Index].name == TGSI_SEMANTIC_PRIMID) {
1182 /* primitive id is in R0.z */
1183 ctx->src[i].sel = 0;
1184 ctx->src[i].swizzle[0] = 2;
1185 }
1186 }
1187 if (src->Register.File == TGSI_FILE_INPUT && src->Register.Dimension) {
1188 int treg = r600_get_temp(ctx);
1189
1190 fetch_gs_input(ctx, src, treg);
1191 ctx->src[i].sel = treg;
1192 }
1193 }
1194 return 0;
1195 }
1196
1197 static int tgsi_split_constant(struct r600_shader_ctx *ctx)
1198 {
1199 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1200 struct r600_bytecode_alu alu;
1201 int i, j, k, nconst, r;
1202
1203 for (i = 0, nconst = 0; i < inst->Instruction.NumSrcRegs; i++) {
1204 if (inst->Src[i].Register.File == TGSI_FILE_CONSTANT) {
1205 nconst++;
1206 }
1207 tgsi_src(ctx, &inst->Src[i], &ctx->src[i]);
1208 }
1209 for (i = 0, j = nconst - 1; i < inst->Instruction.NumSrcRegs; i++) {
1210 if (inst->Src[i].Register.File != TGSI_FILE_CONSTANT) {
1211 continue;
1212 }
1213
1214 if (ctx->src[i].rel) {
1215 int chan = inst->Src[i].Indirect.Swizzle;
1216 int treg = r600_get_temp(ctx);
1217 if ((r = tgsi_fetch_rel_const(ctx, ctx->src[i].kc_bank, ctx->src[i].sel - 512, chan, treg)))
1218 return r;
1219
1220 ctx->src[i].kc_bank = 0;
1221 ctx->src[i].sel = treg;
1222 ctx->src[i].rel = 0;
1223 j--;
1224 } else if (j > 0) {
1225 int treg = r600_get_temp(ctx);
1226 for (k = 0; k < 4; k++) {
1227 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1228 alu.op = ALU_OP1_MOV;
1229 alu.src[0].sel = ctx->src[i].sel;
1230 alu.src[0].chan = k;
1231 alu.src[0].rel = ctx->src[i].rel;
1232 alu.src[0].kc_bank = ctx->src[i].kc_bank;
1233 alu.dst.sel = treg;
1234 alu.dst.chan = k;
1235 alu.dst.write = 1;
1236 if (k == 3)
1237 alu.last = 1;
1238 r = r600_bytecode_add_alu(ctx->bc, &alu);
1239 if (r)
1240 return r;
1241 }
1242 ctx->src[i].sel = treg;
1243 ctx->src[i].rel =0;
1244 j--;
1245 }
1246 }
1247 return 0;
1248 }
1249
1250 /* need to move any immediate into a temp - for trig functions which use literal for PI stuff */
1251 static int tgsi_split_literal_constant(struct r600_shader_ctx *ctx)
1252 {
1253 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
1254 struct r600_bytecode_alu alu;
1255 int i, j, k, nliteral, r;
1256
1257 for (i = 0, nliteral = 0; i < inst->Instruction.NumSrcRegs; i++) {
1258 if (ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1259 nliteral++;
1260 }
1261 }
1262 for (i = 0, j = nliteral - 1; i < inst->Instruction.NumSrcRegs; i++) {
1263 if (j > 0 && ctx->src[i].sel == V_SQ_ALU_SRC_LITERAL) {
1264 int treg = r600_get_temp(ctx);
1265 for (k = 0; k < 4; k++) {
1266 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1267 alu.op = ALU_OP1_MOV;
1268 alu.src[0].sel = ctx->src[i].sel;
1269 alu.src[0].chan = k;
1270 alu.src[0].value = ctx->src[i].value[k];
1271 alu.dst.sel = treg;
1272 alu.dst.chan = k;
1273 alu.dst.write = 1;
1274 if (k == 3)
1275 alu.last = 1;
1276 r = r600_bytecode_add_alu(ctx->bc, &alu);
1277 if (r)
1278 return r;
1279 }
1280 ctx->src[i].sel = treg;
1281 j--;
1282 }
1283 }
1284 return 0;
1285 }
1286
1287 static int process_twoside_color_inputs(struct r600_shader_ctx *ctx)
1288 {
1289 int i, r, count = ctx->shader->ninput;
1290
1291 for (i = 0; i < count; i++) {
1292 if (ctx->shader->input[i].name == TGSI_SEMANTIC_COLOR) {
1293 r = select_twoside_color(ctx, i, ctx->shader->input[i].back_color_input);
1294 if (r)
1295 return r;
1296 }
1297 }
1298 return 0;
1299 }
1300
1301 static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output_info *so)
1302 {
1303 unsigned so_gpr[PIPE_MAX_SHADER_OUTPUTS];
1304 int i, j, r;
1305
1306 /* Sanity checking. */
1307 if (so->num_outputs > PIPE_MAX_SHADER_OUTPUTS) {
1308 R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
1309 r = -EINVAL;
1310 goto out_err;
1311 }
1312 for (i = 0; i < so->num_outputs; i++) {
1313 if (so->output[i].output_buffer >= 4) {
1314 R600_ERR("Exceeded the max number of stream output buffers, got: %d\n",
1315 so->output[i].output_buffer);
1316 r = -EINVAL;
1317 goto out_err;
1318 }
1319 }
1320
1321 /* Initialize locations where the outputs are stored. */
1322 for (i = 0; i < so->num_outputs; i++) {
1323 so_gpr[i] = ctx->shader->output[so->output[i].register_index].gpr;
1324
1325 /* Lower outputs with dst_offset < start_component.
1326 *
1327 * We can only output 4D vectors with a write mask, e.g. we can
1328 * only output the W component at offset 3, etc. If we want
1329 * to store Y, Z, or W at buffer offset 0, we need to use MOV
1330 * to move it to X and output X. */
1331 if (so->output[i].dst_offset < so->output[i].start_component) {
1332 unsigned tmp = r600_get_temp(ctx);
1333
1334 for (j = 0; j < so->output[i].num_components; j++) {
1335 struct r600_bytecode_alu alu;
1336 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1337 alu.op = ALU_OP1_MOV;
1338 alu.src[0].sel = so_gpr[i];
1339 alu.src[0].chan = so->output[i].start_component + j;
1340
1341 alu.dst.sel = tmp;
1342 alu.dst.chan = j;
1343 alu.dst.write = 1;
1344 if (j == so->output[i].num_components - 1)
1345 alu.last = 1;
1346 r = r600_bytecode_add_alu(ctx->bc, &alu);
1347 if (r)
1348 return r;
1349 }
1350 so->output[i].start_component = 0;
1351 so_gpr[i] = tmp;
1352 }
1353 }
1354
1355 /* Write outputs to buffers. */
1356 for (i = 0; i < so->num_outputs; i++) {
1357 struct r600_bytecode_output output;
1358
1359 memset(&output, 0, sizeof(struct r600_bytecode_output));
1360 output.gpr = so_gpr[i];
1361 output.elem_size = so->output[i].num_components;
1362 output.array_base = so->output[i].dst_offset - so->output[i].start_component;
1363 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1364 output.burst_count = 1;
1365 /* array_size is an upper limit for the burst_count
1366 * with MEM_STREAM instructions */
1367 output.array_size = 0xFFF;
1368 output.comp_mask = ((1 << so->output[i].num_components) - 1) << so->output[i].start_component;
1369 if (ctx->bc->chip_class >= EVERGREEN) {
1370 switch (so->output[i].output_buffer) {
1371 case 0:
1372 output.op = CF_OP_MEM_STREAM0_BUF0;
1373 break;
1374 case 1:
1375 output.op = CF_OP_MEM_STREAM0_BUF1;
1376 break;
1377 case 2:
1378 output.op = CF_OP_MEM_STREAM0_BUF2;
1379 break;
1380 case 3:
1381 output.op = CF_OP_MEM_STREAM0_BUF3;
1382 break;
1383 }
1384 } else {
1385 switch (so->output[i].output_buffer) {
1386 case 0:
1387 output.op = CF_OP_MEM_STREAM0;
1388 break;
1389 case 1:
1390 output.op = CF_OP_MEM_STREAM1;
1391 break;
1392 case 2:
1393 output.op = CF_OP_MEM_STREAM2;
1394 break;
1395 case 3:
1396 output.op = CF_OP_MEM_STREAM3;
1397 break;
1398 }
1399 }
1400 r = r600_bytecode_add_output(ctx->bc, &output);
1401 if (r)
1402 goto out_err;
1403 }
1404 return 0;
1405 out_err:
1406 return r;
1407 }
1408
1409 static void convert_edgeflag_to_int(struct r600_shader_ctx *ctx)
1410 {
1411 struct r600_bytecode_alu alu;
1412 unsigned reg;
1413
1414 if (!ctx->shader->vs_out_edgeflag)
1415 return;
1416
1417 reg = ctx->shader->output[ctx->edgeflag_output].gpr;
1418
1419 /* clamp(x, 0, 1) */
1420 memset(&alu, 0, sizeof(alu));
1421 alu.op = ALU_OP1_MOV;
1422 alu.src[0].sel = reg;
1423 alu.dst.sel = reg;
1424 alu.dst.write = 1;
1425 alu.dst.clamp = 1;
1426 alu.last = 1;
1427 r600_bytecode_add_alu(ctx->bc, &alu);
1428
1429 memset(&alu, 0, sizeof(alu));
1430 alu.op = ALU_OP1_FLT_TO_INT;
1431 alu.src[0].sel = reg;
1432 alu.dst.sel = reg;
1433 alu.dst.write = 1;
1434 alu.last = 1;
1435 r600_bytecode_add_alu(ctx->bc, &alu);
1436 }
1437
1438 static int generate_gs_copy_shader(struct r600_context *rctx,
1439 struct r600_pipe_shader *gs,
1440 struct pipe_stream_output_info *so)
1441 {
1442 struct r600_shader_ctx ctx = {};
1443 struct r600_shader *gs_shader = &gs->shader;
1444 struct r600_pipe_shader *cshader;
1445 int ocnt = gs_shader->noutput;
1446 struct r600_bytecode_alu alu;
1447 struct r600_bytecode_vtx vtx;
1448 struct r600_bytecode_output output;
1449 struct r600_bytecode_cf *cf_jump, *cf_pop,
1450 *last_exp_pos = NULL, *last_exp_param = NULL;
1451 int i, next_clip_pos = 61, next_param = 0;
1452
1453 cshader = calloc(1, sizeof(struct r600_pipe_shader));
1454 if (!cshader)
1455 return 0;
1456
1457 memcpy(cshader->shader.output, gs_shader->output, ocnt *
1458 sizeof(struct r600_shader_io));
1459
1460 cshader->shader.noutput = ocnt;
1461
1462 ctx.shader = &cshader->shader;
1463 ctx.bc = &ctx.shader->bc;
1464 ctx.type = ctx.bc->type = TGSI_PROCESSOR_VERTEX;
1465
1466 r600_bytecode_init(ctx.bc, rctx->b.chip_class, rctx->b.family,
1467 rctx->screen->has_compressed_msaa_texturing);
1468
1469 ctx.bc->isa = rctx->isa;
1470
1471 /* R0.x = R0.x & 0x3fffffff */
1472 memset(&alu, 0, sizeof(alu));
1473 alu.op = ALU_OP2_AND_INT;
1474 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1475 alu.src[1].value = 0x3fffffff;
1476 alu.dst.write = 1;
1477 r600_bytecode_add_alu(ctx.bc, &alu);
1478
1479 /* R0.y = R0.x >> 30 */
1480 memset(&alu, 0, sizeof(alu));
1481 alu.op = ALU_OP2_LSHR_INT;
1482 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1483 alu.src[1].value = 0x1e;
1484 alu.dst.chan = 1;
1485 alu.dst.write = 1;
1486 alu.last = 1;
1487 r600_bytecode_add_alu(ctx.bc, &alu);
1488
1489 /* PRED_SETE_INT __, R0.y, 0 */
1490 memset(&alu, 0, sizeof(alu));
1491 alu.op = ALU_OP2_PRED_SETE_INT;
1492 alu.src[0].chan = 1;
1493 alu.src[1].sel = V_SQ_ALU_SRC_0;
1494 alu.execute_mask = 1;
1495 alu.update_pred = 1;
1496 alu.last = 1;
1497 r600_bytecode_add_alu_type(ctx.bc, &alu, CF_OP_ALU_PUSH_BEFORE);
1498
1499 r600_bytecode_add_cfinst(ctx.bc, CF_OP_JUMP);
1500 cf_jump = ctx.bc->cf_last;
1501
1502 /* fetch vertex data from GSVS ring */
1503 for (i = 0; i < ocnt; ++i) {
1504 struct r600_shader_io *out = &ctx.shader->output[i];
1505 out->gpr = i + 1;
1506 out->ring_offset = i * 16;
1507
1508 memset(&vtx, 0, sizeof(vtx));
1509 vtx.op = FETCH_OP_VFETCH;
1510 vtx.buffer_id = R600_GS_RING_CONST_BUFFER;
1511 vtx.fetch_type = 2;
1512 vtx.offset = out->ring_offset;
1513 vtx.dst_gpr = out->gpr;
1514 vtx.dst_sel_x = 0;
1515 vtx.dst_sel_y = 1;
1516 vtx.dst_sel_z = 2;
1517 vtx.dst_sel_w = 3;
1518 if (rctx->b.chip_class >= EVERGREEN) {
1519 vtx.use_const_fields = 1;
1520 } else {
1521 vtx.data_format = FMT_32_32_32_32_FLOAT;
1522 }
1523
1524 r600_bytecode_add_vtx(ctx.bc, &vtx);
1525 }
1526
1527 /* XXX handle clipvertex, streamout? */
1528 emit_streamout(&ctx, so);
1529
1530 /* export vertex data */
1531 /* XXX factor out common code with r600_shader_from_tgsi ? */
1532 for (i = 0; i < ocnt; ++i) {
1533 struct r600_shader_io *out = &ctx.shader->output[i];
1534
1535 if (out->name == TGSI_SEMANTIC_CLIPVERTEX)
1536 continue;
1537
1538 memset(&output, 0, sizeof(output));
1539 output.gpr = out->gpr;
1540 output.elem_size = 3;
1541 output.swizzle_x = 0;
1542 output.swizzle_y = 1;
1543 output.swizzle_z = 2;
1544 output.swizzle_w = 3;
1545 output.burst_count = 1;
1546 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1547 output.op = CF_OP_EXPORT;
1548 switch (out->name) {
1549 case TGSI_SEMANTIC_POSITION:
1550 output.array_base = 60;
1551 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1552 break;
1553
1554 case TGSI_SEMANTIC_PSIZE:
1555 output.array_base = 61;
1556 if (next_clip_pos == 61)
1557 next_clip_pos = 62;
1558 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1559 output.swizzle_y = 7;
1560 output.swizzle_z = 7;
1561 output.swizzle_w = 7;
1562 ctx.shader->vs_out_misc_write = 1;
1563 ctx.shader->vs_out_point_size = 1;
1564 break;
1565 case TGSI_SEMANTIC_LAYER:
1566 if (out->spi_sid) {
1567 /* duplicate it as PARAM to pass to the pixel shader */
1568 output.array_base = next_param++;
1569 r600_bytecode_add_output(ctx.bc, &output);
1570 last_exp_param = ctx.bc->cf_last;
1571 }
1572 output.array_base = 61;
1573 if (next_clip_pos == 61)
1574 next_clip_pos = 62;
1575 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1576 output.swizzle_x = 7;
1577 output.swizzle_y = 7;
1578 output.swizzle_z = 0;
1579 output.swizzle_w = 7;
1580 ctx.shader->vs_out_misc_write = 1;
1581 ctx.shader->vs_out_layer = 1;
1582 break;
1583 case TGSI_SEMANTIC_VIEWPORT_INDEX:
1584 if (out->spi_sid) {
1585 /* duplicate it as PARAM to pass to the pixel shader */
1586 output.array_base = next_param++;
1587 r600_bytecode_add_output(ctx.bc, &output);
1588 last_exp_param = ctx.bc->cf_last;
1589 }
1590 output.array_base = 61;
1591 if (next_clip_pos == 61)
1592 next_clip_pos = 62;
1593 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1594 ctx.shader->vs_out_misc_write = 1;
1595 ctx.shader->vs_out_viewport = 1;
1596 output.swizzle_x = 7;
1597 output.swizzle_y = 7;
1598 output.swizzle_z = 7;
1599 output.swizzle_w = 0;
1600 break;
1601 case TGSI_SEMANTIC_CLIPDIST:
1602 /* spi_sid is 0 for clipdistance outputs that were generated
1603 * for clipvertex - we don't need to pass them to PS */
1604 ctx.shader->clip_dist_write = gs->shader.clip_dist_write;
1605 if (out->spi_sid) {
1606 /* duplicate it as PARAM to pass to the pixel shader */
1607 output.array_base = next_param++;
1608 r600_bytecode_add_output(ctx.bc, &output);
1609 last_exp_param = ctx.bc->cf_last;
1610 }
1611 output.array_base = next_clip_pos++;
1612 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1613 break;
1614 case TGSI_SEMANTIC_FOG:
1615 output.swizzle_y = 4; /* 0 */
1616 output.swizzle_z = 4; /* 0 */
1617 output.swizzle_w = 5; /* 1 */
1618 break;
1619 default:
1620 output.array_base = next_param++;
1621 break;
1622 }
1623 r600_bytecode_add_output(ctx.bc, &output);
1624 if (output.type == V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM)
1625 last_exp_param = ctx.bc->cf_last;
1626 else
1627 last_exp_pos = ctx.bc->cf_last;
1628 }
1629
1630 if (!last_exp_pos) {
1631 memset(&output, 0, sizeof(output));
1632 output.gpr = 0;
1633 output.elem_size = 3;
1634 output.swizzle_x = 7;
1635 output.swizzle_y = 7;
1636 output.swizzle_z = 7;
1637 output.swizzle_w = 7;
1638 output.burst_count = 1;
1639 output.type = 2;
1640 output.op = CF_OP_EXPORT;
1641 output.array_base = 60;
1642 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
1643 r600_bytecode_add_output(ctx.bc, &output);
1644 last_exp_pos = ctx.bc->cf_last;
1645 }
1646
1647 if (!last_exp_param) {
1648 memset(&output, 0, sizeof(output));
1649 output.gpr = 0;
1650 output.elem_size = 3;
1651 output.swizzle_x = 7;
1652 output.swizzle_y = 7;
1653 output.swizzle_z = 7;
1654 output.swizzle_w = 7;
1655 output.burst_count = 1;
1656 output.type = 2;
1657 output.op = CF_OP_EXPORT;
1658 output.array_base = next_param++;
1659 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
1660 r600_bytecode_add_output(ctx.bc, &output);
1661 last_exp_param = ctx.bc->cf_last;
1662 }
1663
1664 last_exp_pos->op = CF_OP_EXPORT_DONE;
1665 last_exp_param->op = CF_OP_EXPORT_DONE;
1666
1667 r600_bytecode_add_cfinst(ctx.bc, CF_OP_POP);
1668 cf_pop = ctx.bc->cf_last;
1669
1670 cf_jump->cf_addr = cf_pop->id + 2;
1671 cf_jump->pop_count = 1;
1672 cf_pop->cf_addr = cf_pop->id + 2;
1673 cf_pop->pop_count = 1;
1674
1675 if (ctx.bc->chip_class == CAYMAN)
1676 cm_bytecode_add_cf_end(ctx.bc);
1677 else {
1678 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
1679 ctx.bc->cf_last->end_of_program = 1;
1680 }
1681
1682 gs->gs_copy_shader = cshader;
1683
1684 ctx.bc->nstack = 1;
1685 cshader->shader.ring_item_size = ocnt * 16;
1686
1687 return r600_bytecode_build(ctx.bc);
1688 }
1689
1690 static int emit_gs_ring_writes(struct r600_shader_ctx *ctx, bool ind)
1691 {
1692 struct r600_bytecode_output output;
1693 int i, k, ring_offset;
1694
1695 for (i = 0; i < ctx->shader->noutput; i++) {
1696 if (ctx->gs_for_vs) {
1697 /* for ES we need to lookup corresponding ring offset expected by GS
1698 * (map this output to GS input by name and sid) */
1699 /* FIXME precompute offsets */
1700 ring_offset = -1;
1701 for(k = 0; k < ctx->gs_for_vs->ninput; ++k) {
1702 struct r600_shader_io *in = &ctx->gs_for_vs->input[k];
1703 struct r600_shader_io *out = &ctx->shader->output[i];
1704 if (in->name == out->name && in->sid == out->sid)
1705 ring_offset = in->ring_offset;
1706 }
1707
1708 if (ring_offset == -1)
1709 continue;
1710 } else
1711 ring_offset = i * 16;
1712
1713 /* next_ring_offset after parsing input decls contains total size of
1714 * single vertex data, gs_next_vertex - current vertex index */
1715 if (!ind)
1716 ring_offset += ctx->gs_out_ring_offset * ctx->gs_next_vertex;
1717
1718 /* get a temp and add the ring offset to the next vertex base in the shader */
1719 memset(&output, 0, sizeof(struct r600_bytecode_output));
1720 output.gpr = ctx->shader->output[i].gpr;
1721 output.elem_size = 3;
1722 output.comp_mask = 0xF;
1723 output.burst_count = 1;
1724
1725 if (ind)
1726 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE_IND;
1727 else
1728 output.type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_WRITE;
1729 output.op = CF_OP_MEM_RING;
1730
1731
1732 if (ind) {
1733 output.array_base = ring_offset >> 2; /* in dwords */
1734 output.array_size = 0xfff;
1735 output.index_gpr = ctx->gs_export_gpr_treg;
1736 } else
1737 output.array_base = ring_offset >> 2; /* in dwords */
1738 r600_bytecode_add_output(ctx->bc, &output);
1739 }
1740
1741 if (ind) {
1742 struct r600_bytecode_alu alu;
1743 int r;
1744
1745 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
1746 alu.op = ALU_OP2_ADD_INT;
1747 alu.src[0].sel = ctx->gs_export_gpr_treg;
1748 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
1749 alu.src[1].value = ctx->gs_out_ring_offset >> 4;
1750 alu.dst.sel = ctx->gs_export_gpr_treg;
1751 alu.dst.write = 1;
1752 alu.last = 1;
1753 r = r600_bytecode_add_alu(ctx->bc, &alu);
1754 if (r)
1755 return r;
1756 }
1757 ++ctx->gs_next_vertex;
1758 return 0;
1759 }
1760
1761 static int r600_shader_from_tgsi(struct r600_context *rctx,
1762 struct r600_pipe_shader *pipeshader,
1763 struct r600_shader_key key)
1764 {
1765 struct r600_screen *rscreen = rctx->screen;
1766 struct r600_shader *shader = &pipeshader->shader;
1767 struct tgsi_token *tokens = pipeshader->selector->tokens;
1768 struct pipe_stream_output_info so = pipeshader->selector->so;
1769 struct tgsi_full_immediate *immediate;
1770 struct tgsi_full_property *property;
1771 struct r600_shader_ctx ctx;
1772 struct r600_bytecode_output output[32];
1773 unsigned output_done, noutput;
1774 unsigned opcode;
1775 int i, j, k, r = 0;
1776 int next_param_base = 0, next_clip_base;
1777 int max_color_exports = MAX2(key.nr_cbufs, 1);
1778 /* Declarations used by llvm code */
1779 bool use_llvm = false;
1780 bool indirect_gprs;
1781 bool ring_outputs = false;
1782 bool pos_emitted = false;
1783
1784 #ifdef R600_USE_LLVM
1785 use_llvm = rscreen->b.debug_flags & DBG_LLVM;
1786 #endif
1787 ctx.bc = &shader->bc;
1788 ctx.shader = shader;
1789 ctx.native_integers = true;
1790
1791 shader->vs_as_es = key.vs_as_es;
1792
1793 r600_bytecode_init(ctx.bc, rscreen->b.chip_class, rscreen->b.family,
1794 rscreen->has_compressed_msaa_texturing);
1795 ctx.tokens = tokens;
1796 tgsi_scan_shader(tokens, &ctx.info);
1797 shader->indirect_files = ctx.info.indirect_files;
1798 indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT);
1799 tgsi_parse_init(&ctx.parse, tokens);
1800 ctx.type = ctx.parse.FullHeader.Processor.Processor;
1801 shader->processor_type = ctx.type;
1802 ctx.bc->type = shader->processor_type;
1803
1804 ring_outputs = key.vs_as_es || (ctx.type == TGSI_PROCESSOR_GEOMETRY);
1805
1806 if (key.vs_as_es) {
1807 ctx.gs_for_vs = &rctx->gs_shader->current->shader;
1808 } else {
1809 ctx.gs_for_vs = NULL;
1810 }
1811
1812 ctx.next_ring_offset = 0;
1813 ctx.gs_out_ring_offset = 0;
1814 ctx.gs_next_vertex = 0;
1815
1816 ctx.face_gpr = -1;
1817 ctx.fixed_pt_position_gpr = -1;
1818 ctx.fragcoord_input = -1;
1819 ctx.colors_used = 0;
1820 ctx.clip_vertex_write = 0;
1821
1822 shader->nr_ps_color_exports = 0;
1823 shader->nr_ps_max_color_exports = 0;
1824
1825 shader->two_side = key.color_two_side;
1826
1827 /* register allocations */
1828 /* Values [0,127] correspond to GPR[0..127].
1829 * Values [128,159] correspond to constant buffer bank 0
1830 * Values [160,191] correspond to constant buffer bank 1
1831 * Values [256,511] correspond to cfile constants c[0..255]. (Gone on EG)
1832 * Values [256,287] correspond to constant buffer bank 2 (EG)
1833 * Values [288,319] correspond to constant buffer bank 3 (EG)
1834 * Other special values are shown in the list below.
1835 * 244 ALU_SRC_1_DBL_L: special constant 1.0 double-float, LSW. (RV670+)
1836 * 245 ALU_SRC_1_DBL_M: special constant 1.0 double-float, MSW. (RV670+)
1837 * 246 ALU_SRC_0_5_DBL_L: special constant 0.5 double-float, LSW. (RV670+)
1838 * 247 ALU_SRC_0_5_DBL_M: special constant 0.5 double-float, MSW. (RV670+)
1839 * 248 SQ_ALU_SRC_0: special constant 0.0.
1840 * 249 SQ_ALU_SRC_1: special constant 1.0 float.
1841 * 250 SQ_ALU_SRC_1_INT: special constant 1 integer.
1842 * 251 SQ_ALU_SRC_M_1_INT: special constant -1 integer.
1843 * 252 SQ_ALU_SRC_0_5: special constant 0.5 float.
1844 * 253 SQ_ALU_SRC_LITERAL: literal constant.
1845 * 254 SQ_ALU_SRC_PV: previous vector result.
1846 * 255 SQ_ALU_SRC_PS: previous scalar result.
1847 */
1848 for (i = 0; i < TGSI_FILE_COUNT; i++) {
1849 ctx.file_offset[i] = 0;
1850 }
1851
1852 #ifdef R600_USE_LLVM
1853 if (use_llvm && ctx.info.indirect_files && (ctx.info.indirect_files & (1 << TGSI_FILE_CONSTANT)) != ctx.info.indirect_files) {
1854 fprintf(stderr, "Warning: R600 LLVM backend does not support "
1855 "indirect adressing. Falling back to TGSI "
1856 "backend.\n");
1857 use_llvm = 0;
1858 }
1859 #endif
1860 if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1861 ctx.file_offset[TGSI_FILE_INPUT] = 1;
1862 if (!use_llvm) {
1863 r600_bytecode_add_cfinst(ctx.bc, CF_OP_CALL_FS);
1864 }
1865 }
1866 if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
1867 if (ctx.bc->chip_class >= EVERGREEN)
1868 ctx.file_offset[TGSI_FILE_INPUT] = evergreen_gpr_count(&ctx);
1869 else
1870 ctx.file_offset[TGSI_FILE_INPUT] = allocate_system_value_inputs(&ctx, ctx.file_offset[TGSI_FILE_INPUT]);
1871 }
1872 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
1873 /* FIXME 1 would be enough in some cases (3 or less input vertices) */
1874 ctx.file_offset[TGSI_FILE_INPUT] = 2;
1875 }
1876 ctx.use_llvm = use_llvm;
1877
1878 if (use_llvm) {
1879 ctx.file_offset[TGSI_FILE_OUTPUT] =
1880 ctx.file_offset[TGSI_FILE_INPUT];
1881 } else {
1882 ctx.file_offset[TGSI_FILE_OUTPUT] =
1883 ctx.file_offset[TGSI_FILE_INPUT] +
1884 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1885 }
1886 ctx.file_offset[TGSI_FILE_TEMPORARY] = ctx.file_offset[TGSI_FILE_OUTPUT] +
1887 ctx.info.file_max[TGSI_FILE_OUTPUT] + 1;
1888
1889 /* Outside the GPR range. This will be translated to one of the
1890 * kcache banks later. */
1891 ctx.file_offset[TGSI_FILE_CONSTANT] = 512;
1892
1893 ctx.file_offset[TGSI_FILE_IMMEDIATE] = V_SQ_ALU_SRC_LITERAL;
1894 ctx.bc->ar_reg = ctx.file_offset[TGSI_FILE_TEMPORARY] +
1895 ctx.info.file_max[TGSI_FILE_TEMPORARY] + 1;
1896 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
1897 ctx.gs_export_gpr_treg = ctx.bc->ar_reg + 1;
1898 ctx.temp_reg = ctx.bc->ar_reg + 2;
1899 } else
1900 ctx.temp_reg = ctx.bc->ar_reg + 1;
1901
1902 if (indirect_gprs) {
1903 shader->max_arrays = 0;
1904 shader->num_arrays = 0;
1905
1906 if (ctx.info.indirect_files & (1 << TGSI_FILE_INPUT)) {
1907 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_INPUT],
1908 ctx.file_offset[TGSI_FILE_OUTPUT] -
1909 ctx.file_offset[TGSI_FILE_INPUT],
1910 0x0F);
1911 }
1912 if (ctx.info.indirect_files & (1 << TGSI_FILE_OUTPUT)) {
1913 r600_add_gpr_array(shader, ctx.file_offset[TGSI_FILE_OUTPUT],
1914 ctx.file_offset[TGSI_FILE_TEMPORARY] -
1915 ctx.file_offset[TGSI_FILE_OUTPUT],
1916 0x0F);
1917 }
1918 }
1919
1920 ctx.nliterals = 0;
1921 ctx.literals = NULL;
1922 shader->fs_write_all = FALSE;
1923 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
1924 tgsi_parse_token(&ctx.parse);
1925 switch (ctx.parse.FullToken.Token.Type) {
1926 case TGSI_TOKEN_TYPE_IMMEDIATE:
1927 immediate = &ctx.parse.FullToken.FullImmediate;
1928 ctx.literals = realloc(ctx.literals, (ctx.nliterals + 1) * 16);
1929 if(ctx.literals == NULL) {
1930 r = -ENOMEM;
1931 goto out_err;
1932 }
1933 ctx.literals[ctx.nliterals * 4 + 0] = immediate->u[0].Uint;
1934 ctx.literals[ctx.nliterals * 4 + 1] = immediate->u[1].Uint;
1935 ctx.literals[ctx.nliterals * 4 + 2] = immediate->u[2].Uint;
1936 ctx.literals[ctx.nliterals * 4 + 3] = immediate->u[3].Uint;
1937 ctx.nliterals++;
1938 break;
1939 case TGSI_TOKEN_TYPE_DECLARATION:
1940 r = tgsi_declaration(&ctx);
1941 if (r)
1942 goto out_err;
1943 break;
1944 case TGSI_TOKEN_TYPE_INSTRUCTION:
1945 break;
1946 case TGSI_TOKEN_TYPE_PROPERTY:
1947 property = &ctx.parse.FullToken.FullProperty;
1948 switch (property->Property.PropertyName) {
1949 case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
1950 if (property->u[0].Data == 1)
1951 shader->fs_write_all = TRUE;
1952 break;
1953 case TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION:
1954 if (property->u[0].Data == 1)
1955 shader->vs_position_window_space = TRUE;
1956 break;
1957 case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
1958 /* we don't need this one */
1959 break;
1960 case TGSI_PROPERTY_GS_INPUT_PRIM:
1961 shader->gs_input_prim = property->u[0].Data;
1962 break;
1963 case TGSI_PROPERTY_GS_OUTPUT_PRIM:
1964 shader->gs_output_prim = property->u[0].Data;
1965 break;
1966 case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES:
1967 shader->gs_max_out_vertices = property->u[0].Data;
1968 break;
1969 case TGSI_PROPERTY_GS_INVOCATIONS:
1970 shader->gs_num_invocations = property->u[0].Data;
1971 break;
1972 }
1973 break;
1974 default:
1975 R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
1976 r = -EINVAL;
1977 goto out_err;
1978 }
1979 }
1980
1981 shader->ring_item_size = ctx.next_ring_offset;
1982
1983 /* Process two side if needed */
1984 if (shader->two_side && ctx.colors_used) {
1985 int i, count = ctx.shader->ninput;
1986 unsigned next_lds_loc = ctx.shader->nlds;
1987
1988 /* additional inputs will be allocated right after the existing inputs,
1989 * we won't need them after the color selection, so we don't need to
1990 * reserve these gprs for the rest of the shader code and to adjust
1991 * output offsets etc. */
1992 int gpr = ctx.file_offset[TGSI_FILE_INPUT] +
1993 ctx.info.file_max[TGSI_FILE_INPUT] + 1;
1994
1995 /* if two sided and neither face or sample mask is used by shader, ensure face_gpr is emitted */
1996 if (ctx.face_gpr == -1) {
1997 i = ctx.shader->ninput++;
1998 ctx.shader->input[i].name = TGSI_SEMANTIC_FACE;
1999 ctx.shader->input[i].spi_sid = 0;
2000 ctx.shader->input[i].gpr = gpr++;
2001 ctx.face_gpr = ctx.shader->input[i].gpr;
2002 }
2003
2004 for (i = 0; i < count; i++) {
2005 if (ctx.shader->input[i].name == TGSI_SEMANTIC_COLOR) {
2006 int ni = ctx.shader->ninput++;
2007 memcpy(&ctx.shader->input[ni],&ctx.shader->input[i], sizeof(struct r600_shader_io));
2008 ctx.shader->input[ni].name = TGSI_SEMANTIC_BCOLOR;
2009 ctx.shader->input[ni].spi_sid = r600_spi_sid(&ctx.shader->input[ni]);
2010 ctx.shader->input[ni].gpr = gpr++;
2011 // TGSI to LLVM needs to know the lds position of inputs.
2012 // Non LLVM path computes it later (in process_twoside_color)
2013 ctx.shader->input[ni].lds_pos = next_lds_loc++;
2014 ctx.shader->input[i].back_color_input = ni;
2015 if (ctx.bc->chip_class >= EVERGREEN) {
2016 if ((r = evergreen_interp_input(&ctx, ni)))
2017 return r;
2018 }
2019 }
2020 }
2021 }
2022
2023 /* LLVM backend setup */
2024 #ifdef R600_USE_LLVM
2025 if (use_llvm) {
2026 struct radeon_llvm_context radeon_llvm_ctx;
2027 LLVMModuleRef mod;
2028 bool dump = r600_can_dump_shader(&rscreen->b, tokens);
2029 boolean use_kill = false;
2030
2031 memset(&radeon_llvm_ctx, 0, sizeof(radeon_llvm_ctx));
2032 radeon_llvm_ctx.type = ctx.type;
2033 radeon_llvm_ctx.two_side = shader->two_side;
2034 radeon_llvm_ctx.face_gpr = ctx.face_gpr;
2035 radeon_llvm_ctx.inputs_count = ctx.shader->ninput + 1;
2036 radeon_llvm_ctx.r600_inputs = ctx.shader->input;
2037 radeon_llvm_ctx.r600_outputs = ctx.shader->output;
2038 radeon_llvm_ctx.color_buffer_count = max_color_exports;
2039 radeon_llvm_ctx.chip_class = ctx.bc->chip_class;
2040 radeon_llvm_ctx.fs_color_all = shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN);
2041 radeon_llvm_ctx.stream_outputs = &so;
2042 radeon_llvm_ctx.clip_vertex = ctx.cv_output;
2043 radeon_llvm_ctx.alpha_to_one = key.alpha_to_one;
2044 radeon_llvm_ctx.has_compressed_msaa_texturing =
2045 ctx.bc->has_compressed_msaa_texturing;
2046 mod = r600_tgsi_llvm(&radeon_llvm_ctx, tokens);
2047 ctx.shader->has_txq_cube_array_z_comp = radeon_llvm_ctx.has_txq_cube_array_z_comp;
2048 ctx.shader->uses_tex_buffers = radeon_llvm_ctx.uses_tex_buffers;
2049
2050 if (r600_llvm_compile(mod, rscreen->b.family, ctx.bc, &use_kill, dump)) {
2051 radeon_llvm_dispose(&radeon_llvm_ctx);
2052 use_llvm = 0;
2053 fprintf(stderr, "R600 LLVM backend failed to compile "
2054 "shader. Falling back to TGSI\n");
2055 } else {
2056 ctx.file_offset[TGSI_FILE_OUTPUT] =
2057 ctx.file_offset[TGSI_FILE_INPUT];
2058 }
2059 if (use_kill)
2060 ctx.shader->uses_kill = use_kill;
2061 radeon_llvm_dispose(&radeon_llvm_ctx);
2062 }
2063 #endif
2064 /* End of LLVM backend setup */
2065
2066 if (shader->fs_write_all && rscreen->b.chip_class >= EVERGREEN)
2067 shader->nr_ps_max_color_exports = 8;
2068
2069 if (!use_llvm) {
2070 if (ctx.fragcoord_input >= 0) {
2071 if (ctx.bc->chip_class == CAYMAN) {
2072 for (j = 0 ; j < 4; j++) {
2073 struct r600_bytecode_alu alu;
2074 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2075 alu.op = ALU_OP1_RECIP_IEEE;
2076 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
2077 alu.src[0].chan = 3;
2078
2079 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
2080 alu.dst.chan = j;
2081 alu.dst.write = (j == 3);
2082 alu.last = 1;
2083 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
2084 return r;
2085 }
2086 } else {
2087 struct r600_bytecode_alu alu;
2088 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2089 alu.op = ALU_OP1_RECIP_IEEE;
2090 alu.src[0].sel = shader->input[ctx.fragcoord_input].gpr;
2091 alu.src[0].chan = 3;
2092
2093 alu.dst.sel = shader->input[ctx.fragcoord_input].gpr;
2094 alu.dst.chan = 3;
2095 alu.dst.write = 1;
2096 alu.last = 1;
2097 if ((r = r600_bytecode_add_alu(ctx.bc, &alu)))
2098 return r;
2099 }
2100 }
2101
2102 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2103 struct r600_bytecode_alu alu;
2104 int r;
2105
2106 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2107 alu.op = ALU_OP1_MOV;
2108 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
2109 alu.src[0].value = 0;
2110 alu.dst.sel = ctx.gs_export_gpr_treg;
2111 alu.dst.write = 1;
2112 alu.last = 1;
2113 r = r600_bytecode_add_alu(ctx.bc, &alu);
2114 if (r)
2115 return r;
2116 }
2117 if (shader->two_side && ctx.colors_used) {
2118 if ((r = process_twoside_color_inputs(&ctx)))
2119 return r;
2120 }
2121
2122 tgsi_parse_init(&ctx.parse, tokens);
2123 while (!tgsi_parse_end_of_tokens(&ctx.parse)) {
2124 tgsi_parse_token(&ctx.parse);
2125 switch (ctx.parse.FullToken.Token.Type) {
2126 case TGSI_TOKEN_TYPE_INSTRUCTION:
2127 r = tgsi_is_supported(&ctx);
2128 if (r)
2129 goto out_err;
2130 ctx.max_driver_temp_used = 0;
2131 /* reserve first tmp for everyone */
2132 r600_get_temp(&ctx);
2133
2134 opcode = ctx.parse.FullToken.FullInstruction.Instruction.Opcode;
2135 if ((r = tgsi_split_constant(&ctx)))
2136 goto out_err;
2137 if ((r = tgsi_split_literal_constant(&ctx)))
2138 goto out_err;
2139 if (ctx.type == TGSI_PROCESSOR_GEOMETRY)
2140 if ((r = tgsi_split_gs_inputs(&ctx)))
2141 goto out_err;
2142 if (ctx.bc->chip_class == CAYMAN)
2143 ctx.inst_info = &cm_shader_tgsi_instruction[opcode];
2144 else if (ctx.bc->chip_class >= EVERGREEN)
2145 ctx.inst_info = &eg_shader_tgsi_instruction[opcode];
2146 else
2147 ctx.inst_info = &r600_shader_tgsi_instruction[opcode];
2148 r = ctx.inst_info->process(&ctx);
2149 if (r)
2150 goto out_err;
2151 break;
2152 default:
2153 break;
2154 }
2155 }
2156 }
2157
2158 /* Reset the temporary register counter. */
2159 ctx.max_driver_temp_used = 0;
2160
2161 noutput = shader->noutput;
2162
2163 if (!ring_outputs && ctx.clip_vertex_write) {
2164 unsigned clipdist_temp[2];
2165
2166 clipdist_temp[0] = r600_get_temp(&ctx);
2167 clipdist_temp[1] = r600_get_temp(&ctx);
2168
2169 /* need to convert a clipvertex write into clipdistance writes and not export
2170 the clip vertex anymore */
2171
2172 memset(&shader->output[noutput], 0, 2*sizeof(struct r600_shader_io));
2173 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
2174 shader->output[noutput].gpr = clipdist_temp[0];
2175 noutput++;
2176 shader->output[noutput].name = TGSI_SEMANTIC_CLIPDIST;
2177 shader->output[noutput].gpr = clipdist_temp[1];
2178 noutput++;
2179
2180 /* reset spi_sid for clipvertex output to avoid confusing spi */
2181 shader->output[ctx.cv_output].spi_sid = 0;
2182
2183 shader->clip_dist_write = 0xFF;
2184
2185 for (i = 0; i < 8; i++) {
2186 int oreg = i >> 2;
2187 int ochan = i & 3;
2188
2189 for (j = 0; j < 4; j++) {
2190 struct r600_bytecode_alu alu;
2191 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2192 alu.op = ALU_OP2_DOT4;
2193 alu.src[0].sel = shader->output[ctx.cv_output].gpr;
2194 alu.src[0].chan = j;
2195
2196 alu.src[1].sel = 512 + i;
2197 alu.src[1].kc_bank = R600_UCP_CONST_BUFFER;
2198 alu.src[1].chan = j;
2199
2200 alu.dst.sel = clipdist_temp[oreg];
2201 alu.dst.chan = j;
2202 alu.dst.write = (j == ochan);
2203 if (j == 3)
2204 alu.last = 1;
2205 if (!use_llvm)
2206 r = r600_bytecode_add_alu(ctx.bc, &alu);
2207 if (r)
2208 return r;
2209 }
2210 }
2211 }
2212
2213 /* Add stream outputs. */
2214 if (!ring_outputs && ctx.type == TGSI_PROCESSOR_VERTEX &&
2215 so.num_outputs && !use_llvm)
2216 emit_streamout(&ctx, &so);
2217
2218 convert_edgeflag_to_int(&ctx);
2219
2220 if (ring_outputs) {
2221 if (key.vs_as_es)
2222 emit_gs_ring_writes(&ctx, FALSE);
2223 } else {
2224 /* Export output */
2225 next_clip_base = shader->vs_out_misc_write ? 62 : 61;
2226
2227 for (i = 0, j = 0; i < noutput; i++, j++) {
2228 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2229 output[j].gpr = shader->output[i].gpr;
2230 output[j].elem_size = 3;
2231 output[j].swizzle_x = 0;
2232 output[j].swizzle_y = 1;
2233 output[j].swizzle_z = 2;
2234 output[j].swizzle_w = 3;
2235 output[j].burst_count = 1;
2236 output[j].type = -1;
2237 output[j].op = CF_OP_EXPORT;
2238 switch (ctx.type) {
2239 case TGSI_PROCESSOR_VERTEX:
2240 switch (shader->output[i].name) {
2241 case TGSI_SEMANTIC_POSITION:
2242 output[j].array_base = 60;
2243 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2244 pos_emitted = true;
2245 break;
2246
2247 case TGSI_SEMANTIC_PSIZE:
2248 output[j].array_base = 61;
2249 output[j].swizzle_y = 7;
2250 output[j].swizzle_z = 7;
2251 output[j].swizzle_w = 7;
2252 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2253 pos_emitted = true;
2254 break;
2255 case TGSI_SEMANTIC_EDGEFLAG:
2256 output[j].array_base = 61;
2257 output[j].swizzle_x = 7;
2258 output[j].swizzle_y = 0;
2259 output[j].swizzle_z = 7;
2260 output[j].swizzle_w = 7;
2261 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2262 pos_emitted = true;
2263 break;
2264 case TGSI_SEMANTIC_LAYER:
2265 /* spi_sid is 0 for outputs that are
2266 * not consumed by PS */
2267 if (shader->output[i].spi_sid) {
2268 output[j].array_base = next_param_base++;
2269 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2270 j++;
2271 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2272 }
2273 output[j].array_base = 61;
2274 output[j].swizzle_x = 7;
2275 output[j].swizzle_y = 7;
2276 output[j].swizzle_z = 0;
2277 output[j].swizzle_w = 7;
2278 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2279 pos_emitted = true;
2280 break;
2281 case TGSI_SEMANTIC_VIEWPORT_INDEX:
2282 /* spi_sid is 0 for outputs that are
2283 * not consumed by PS */
2284 if (shader->output[i].spi_sid) {
2285 output[j].array_base = next_param_base++;
2286 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2287 j++;
2288 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2289 }
2290 output[j].array_base = 61;
2291 output[j].swizzle_x = 7;
2292 output[j].swizzle_y = 7;
2293 output[j].swizzle_z = 7;
2294 output[j].swizzle_w = 0;
2295 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2296 pos_emitted = true;
2297 break;
2298 case TGSI_SEMANTIC_CLIPVERTEX:
2299 j--;
2300 break;
2301 case TGSI_SEMANTIC_CLIPDIST:
2302 output[j].array_base = next_clip_base++;
2303 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2304 pos_emitted = true;
2305 /* spi_sid is 0 for clipdistance outputs that were generated
2306 * for clipvertex - we don't need to pass them to PS */
2307 if (shader->output[i].spi_sid) {
2308 j++;
2309 /* duplicate it as PARAM to pass to the pixel shader */
2310 memcpy(&output[j], &output[j-1], sizeof(struct r600_bytecode_output));
2311 output[j].array_base = next_param_base++;
2312 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2313 }
2314 break;
2315 case TGSI_SEMANTIC_FOG:
2316 output[j].swizzle_y = 4; /* 0 */
2317 output[j].swizzle_z = 4; /* 0 */
2318 output[j].swizzle_w = 5; /* 1 */
2319 break;
2320 }
2321 break;
2322 case TGSI_PROCESSOR_FRAGMENT:
2323 if (shader->output[i].name == TGSI_SEMANTIC_COLOR) {
2324 /* never export more colors than the number of CBs */
2325 if (shader->output[i].sid >= max_color_exports) {
2326 /* skip export */
2327 j--;
2328 continue;
2329 }
2330 output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
2331 output[j].array_base = shader->output[i].sid;
2332 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2333 shader->nr_ps_color_exports++;
2334 if (shader->fs_write_all && (rscreen->b.chip_class >= EVERGREEN)) {
2335 for (k = 1; k < max_color_exports; k++) {
2336 j++;
2337 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2338 output[j].gpr = shader->output[i].gpr;
2339 output[j].elem_size = 3;
2340 output[j].swizzle_x = 0;
2341 output[j].swizzle_y = 1;
2342 output[j].swizzle_z = 2;
2343 output[j].swizzle_w = key.alpha_to_one ? 5 : 3;
2344 output[j].burst_count = 1;
2345 output[j].array_base = k;
2346 output[j].op = CF_OP_EXPORT;
2347 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2348 shader->nr_ps_color_exports++;
2349 }
2350 }
2351 } else if (shader->output[i].name == TGSI_SEMANTIC_POSITION) {
2352 output[j].array_base = 61;
2353 output[j].swizzle_x = 2;
2354 output[j].swizzle_y = 7;
2355 output[j].swizzle_z = output[j].swizzle_w = 7;
2356 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2357 } else if (shader->output[i].name == TGSI_SEMANTIC_STENCIL) {
2358 output[j].array_base = 61;
2359 output[j].swizzle_x = 7;
2360 output[j].swizzle_y = 1;
2361 output[j].swizzle_z = output[j].swizzle_w = 7;
2362 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2363 } else if (shader->output[i].name == TGSI_SEMANTIC_SAMPLEMASK) {
2364 output[j].array_base = 61;
2365 output[j].swizzle_x = 7;
2366 output[j].swizzle_y = 7;
2367 output[j].swizzle_z = 0;
2368 output[j].swizzle_w = 7;
2369 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2370 } else {
2371 R600_ERR("unsupported fragment output name %d\n", shader->output[i].name);
2372 r = -EINVAL;
2373 goto out_err;
2374 }
2375 break;
2376 default:
2377 R600_ERR("unsupported processor type %d\n", ctx.type);
2378 r = -EINVAL;
2379 goto out_err;
2380 }
2381
2382 if (output[j].type==-1) {
2383 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2384 output[j].array_base = next_param_base++;
2385 }
2386 }
2387
2388 /* add fake position export */
2389 if (ctx.type == TGSI_PROCESSOR_VERTEX && pos_emitted == false) {
2390 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2391 output[j].gpr = 0;
2392 output[j].elem_size = 3;
2393 output[j].swizzle_x = 7;
2394 output[j].swizzle_y = 7;
2395 output[j].swizzle_z = 7;
2396 output[j].swizzle_w = 7;
2397 output[j].burst_count = 1;
2398 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_POS;
2399 output[j].array_base = 60;
2400 output[j].op = CF_OP_EXPORT;
2401 j++;
2402 }
2403
2404 /* add fake param output for vertex shader if no param is exported */
2405 if (ctx.type == TGSI_PROCESSOR_VERTEX && next_param_base == 0) {
2406 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2407 output[j].gpr = 0;
2408 output[j].elem_size = 3;
2409 output[j].swizzle_x = 7;
2410 output[j].swizzle_y = 7;
2411 output[j].swizzle_z = 7;
2412 output[j].swizzle_w = 7;
2413 output[j].burst_count = 1;
2414 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PARAM;
2415 output[j].array_base = 0;
2416 output[j].op = CF_OP_EXPORT;
2417 j++;
2418 }
2419
2420 /* add fake pixel export */
2421 if (ctx.type == TGSI_PROCESSOR_FRAGMENT && shader->nr_ps_color_exports == 0) {
2422 memset(&output[j], 0, sizeof(struct r600_bytecode_output));
2423 output[j].gpr = 0;
2424 output[j].elem_size = 3;
2425 output[j].swizzle_x = 7;
2426 output[j].swizzle_y = 7;
2427 output[j].swizzle_z = 7;
2428 output[j].swizzle_w = 7;
2429 output[j].burst_count = 1;
2430 output[j].type = V_SQ_CF_ALLOC_EXPORT_WORD0_SQ_EXPORT_PIXEL;
2431 output[j].array_base = 0;
2432 output[j].op = CF_OP_EXPORT;
2433 j++;
2434 }
2435
2436 noutput = j;
2437
2438 /* set export done on last export of each type */
2439 for (i = noutput - 1, output_done = 0; i >= 0; i--) {
2440 if (!(output_done & (1 << output[i].type))) {
2441 output_done |= (1 << output[i].type);
2442 output[i].op = CF_OP_EXPORT_DONE;
2443 }
2444 }
2445 /* add output to bytecode */
2446 if (!use_llvm) {
2447 for (i = 0; i < noutput; i++) {
2448 r = r600_bytecode_add_output(ctx.bc, &output[i]);
2449 if (r)
2450 goto out_err;
2451 }
2452 }
2453 }
2454
2455 /* add program end */
2456 if (!use_llvm) {
2457 if (ctx.bc->chip_class == CAYMAN)
2458 cm_bytecode_add_cf_end(ctx.bc);
2459 else {
2460 const struct cf_op_info *last = NULL;
2461
2462 if (ctx.bc->cf_last)
2463 last = r600_isa_cf(ctx.bc->cf_last->op);
2464
2465 /* alu clause instructions don't have EOP bit, so add NOP */
2466 if (!last || last->flags & CF_ALU || ctx.bc->cf_last->op == CF_OP_LOOP_END || ctx.bc->cf_last->op == CF_OP_CALL_FS)
2467 r600_bytecode_add_cfinst(ctx.bc, CF_OP_NOP);
2468
2469 ctx.bc->cf_last->end_of_program = 1;
2470 }
2471 }
2472
2473 /* check GPR limit - we have 124 = 128 - 4
2474 * (4 are reserved as alu clause temporary registers) */
2475 if (ctx.bc->ngpr > 124) {
2476 R600_ERR("GPR limit exceeded - shader requires %d registers\n", ctx.bc->ngpr);
2477 r = -ENOMEM;
2478 goto out_err;
2479 }
2480
2481 if (ctx.type == TGSI_PROCESSOR_GEOMETRY) {
2482 if ((r = generate_gs_copy_shader(rctx, pipeshader, &so)))
2483 return r;
2484 }
2485
2486 free(ctx.literals);
2487 tgsi_parse_free(&ctx.parse);
2488 return 0;
2489 out_err:
2490 free(ctx.literals);
2491 tgsi_parse_free(&ctx.parse);
2492 return r;
2493 }
2494
2495 static int tgsi_unsupported(struct r600_shader_ctx *ctx)
2496 {
2497 R600_ERR("%s tgsi opcode unsupported\n",
2498 tgsi_get_opcode_name(ctx->inst_info->tgsi_opcode));
2499 return -EINVAL;
2500 }
2501
/* Emits nothing and always reports success; ctx is not used. */
static int tgsi_end(struct r600_shader_ctx *ctx)
{
	(void)ctx;
	return 0;
}
2506
2507 static void r600_bytecode_src(struct r600_bytecode_alu_src *bc_src,
2508 const struct r600_shader_src *shader_src,
2509 unsigned chan)
2510 {
2511 bc_src->sel = shader_src->sel;
2512 bc_src->chan = shader_src->swizzle[chan];
2513 bc_src->neg = shader_src->neg;
2514 bc_src->abs = shader_src->abs;
2515 bc_src->rel = shader_src->rel;
2516 bc_src->value = shader_src->value[bc_src->chan];
2517 bc_src->kc_bank = shader_src->kc_bank;
2518 }
2519
2520 static void r600_bytecode_src_set_abs(struct r600_bytecode_alu_src *bc_src)
2521 {
2522 bc_src->abs = 1;
2523 bc_src->neg = 0;
2524 }
2525
2526 static void r600_bytecode_src_toggle_neg(struct r600_bytecode_alu_src *bc_src)
2527 {
2528 bc_src->neg = !bc_src->neg;
2529 }
2530
2531 static void tgsi_dst(struct r600_shader_ctx *ctx,
2532 const struct tgsi_full_dst_register *tgsi_dst,
2533 unsigned swizzle,
2534 struct r600_bytecode_alu_dst *r600_dst)
2535 {
2536 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2537
2538 r600_dst->sel = tgsi_dst->Register.Index;
2539 r600_dst->sel += ctx->file_offset[tgsi_dst->Register.File];
2540 r600_dst->chan = swizzle;
2541 r600_dst->write = 1;
2542 if (tgsi_dst->Register.Indirect)
2543 r600_dst->rel = V_SQ_REL_RELATIVE;
2544 if (inst->Instruction.Saturate) {
2545 r600_dst->clamp = 1;
2546 }
2547 }
2548
/*
 * Return the index (0..3) of the highest channel enabled in writemask.
 * Bits above bit 3 are ignored; an empty mask yields 0.
 */
static int tgsi_last_instruction(unsigned writemask)
{
	int chan;

	/* scan from W down to Y; X (or nothing set) falls through to 0 */
	for (chan = 3; chan > 0; chan--) {
		if (writemask & (1u << chan))
			return chan;
	}
	return 0;
}
2560
2561 static int tgsi_op2_s(struct r600_shader_ctx *ctx, int swap, int trans_only)
2562 {
2563 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2564 struct r600_bytecode_alu alu;
2565 unsigned write_mask = inst->Dst[0].Register.WriteMask;
2566 int i, j, r, lasti = tgsi_last_instruction(write_mask);
2567 /* use temp register if trans_only and more than one dst component */
2568 int use_tmp = trans_only && (write_mask ^ (1 << lasti));
2569
2570 for (i = 0; i <= lasti; i++) {
2571 if (!(write_mask & (1 << i)))
2572 continue;
2573
2574 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2575 if (use_tmp) {
2576 alu.dst.sel = ctx->temp_reg;
2577 alu.dst.chan = i;
2578 alu.dst.write = 1;
2579 } else
2580 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2581
2582 alu.op = ctx->inst_info->op;
2583 if (!swap) {
2584 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
2585 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
2586 }
2587 } else {
2588 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
2589 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2590 }
2591 /* handle some special cases */
2592 switch (ctx->inst_info->tgsi_opcode) {
2593 case TGSI_OPCODE_SUB:
2594 r600_bytecode_src_toggle_neg(&alu.src[1]);
2595 break;
2596 case TGSI_OPCODE_ABS:
2597 r600_bytecode_src_set_abs(&alu.src[0]);
2598 break;
2599 default:
2600 break;
2601 }
2602 if (i == lasti || trans_only) {
2603 alu.last = 1;
2604 }
2605 r = r600_bytecode_add_alu(ctx->bc, &alu);
2606 if (r)
2607 return r;
2608 }
2609
2610 if (use_tmp) {
2611 /* move result from temp to dst */
2612 for (i = 0; i <= lasti; i++) {
2613 if (!(write_mask & (1 << i)))
2614 continue;
2615
2616 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2617 alu.op = ALU_OP1_MOV;
2618 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2619 alu.src[0].sel = ctx->temp_reg;
2620 alu.src[0].chan = i;
2621 alu.last = (i == lasti);
2622
2623 r = r600_bytecode_add_alu(ctx->bc, &alu);
2624 if (r)
2625 return r;
2626 }
2627 }
2628 return 0;
2629 }
2630
/* Per-channel ALU op: sources in TGSI order, no trans-only handling. */
static int tgsi_op2(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
2635
/* Per-channel ALU op with the first two sources exchanged. */
static int tgsi_op2_swap(struct r600_shader_ctx *ctx)
{
	const int swap = 1;
	const int trans_only = 0;

	return tgsi_op2_s(ctx, swap, trans_only);
}
2640
/* Per-channel ALU op emitted in trans-only mode (one op per group). */
static int tgsi_op2_trans(struct r600_shader_ctx *ctx)
{
	const int swap = 0;
	const int trans_only = 1;

	return tgsi_op2_s(ctx, swap, trans_only);
}
2645
2646 static int tgsi_ineg(struct r600_shader_ctx *ctx)
2647 {
2648 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2649 struct r600_bytecode_alu alu;
2650 int i, r;
2651 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2652
2653 for (i = 0; i < lasti + 1; i++) {
2654
2655 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2656 continue;
2657 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2658 alu.op = ctx->inst_info->op;
2659
2660 alu.src[0].sel = V_SQ_ALU_SRC_0;
2661
2662 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
2663
2664 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2665
2666 if (i == lasti) {
2667 alu.last = 1;
2668 }
2669 r = r600_bytecode_add_alu(ctx->bc, &alu);
2670 if (r)
2671 return r;
2672 }
2673 return 0;
2674
2675 }
2676
2677 static int cayman_emit_float_instr(struct r600_shader_ctx *ctx)
2678 {
2679 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2680 int i, j, r;
2681 struct r600_bytecode_alu alu;
2682 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2683
2684 for (i = 0 ; i < last_slot; i++) {
2685 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2686 alu.op = ctx->inst_info->op;
2687 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
2688 r600_bytecode_src(&alu.src[j], &ctx->src[j], 0);
2689
2690 /* RSQ should take the absolute value of src */
2691 if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_RSQ) {
2692 r600_bytecode_src_set_abs(&alu.src[j]);
2693 }
2694 }
2695 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2696 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2697
2698 if (i == last_slot - 1)
2699 alu.last = 1;
2700 r = r600_bytecode_add_alu(ctx->bc, &alu);
2701 if (r)
2702 return r;
2703 }
2704 return 0;
2705 }
2706
2707 static int cayman_mul_int_instr(struct r600_shader_ctx *ctx)
2708 {
2709 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2710 int i, j, k, r;
2711 struct r600_bytecode_alu alu;
2712 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2713 for (k = 0; k < last_slot; k++) {
2714 if (!(inst->Dst[0].Register.WriteMask & (1 << k)))
2715 continue;
2716
2717 for (i = 0 ; i < 4; i++) {
2718 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2719 alu.op = ctx->inst_info->op;
2720 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
2721 r600_bytecode_src(&alu.src[j], &ctx->src[j], k);
2722 }
2723 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2724 alu.dst.write = (i == k);
2725 if (i == 3)
2726 alu.last = 1;
2727 r = r600_bytecode_add_alu(ctx->bc, &alu);
2728 if (r)
2729 return r;
2730 }
2731 }
2732 return 0;
2733 }
2734
2735 /*
2736 * r600 - trunc to -PI..PI range
2737 * r700 - normalize by dividing by 2PI
2738 * see fdo bug 27901
2739 */
2740 static int tgsi_setup_trig(struct r600_shader_ctx *ctx)
2741 {
2742 static float half_inv_pi = 1.0 /(3.1415926535 * 2);
2743 static float double_pi = 3.1415926535 * 2;
2744 static float neg_pi = -3.1415926535;
2745
2746 int r;
2747 struct r600_bytecode_alu alu;
2748
2749 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2750 alu.op = ALU_OP3_MULADD;
2751 alu.is_op3 = 1;
2752
2753 alu.dst.chan = 0;
2754 alu.dst.sel = ctx->temp_reg;
2755 alu.dst.write = 1;
2756
2757 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
2758
2759 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2760 alu.src[1].chan = 0;
2761 alu.src[1].value = *(uint32_t *)&half_inv_pi;
2762 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2763 alu.src[2].chan = 0;
2764 alu.last = 1;
2765 r = r600_bytecode_add_alu(ctx->bc, &alu);
2766 if (r)
2767 return r;
2768
2769 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2770 alu.op = ALU_OP1_FRACT;
2771
2772 alu.dst.chan = 0;
2773 alu.dst.sel = ctx->temp_reg;
2774 alu.dst.write = 1;
2775
2776 alu.src[0].sel = ctx->temp_reg;
2777 alu.src[0].chan = 0;
2778 alu.last = 1;
2779 r = r600_bytecode_add_alu(ctx->bc, &alu);
2780 if (r)
2781 return r;
2782
2783 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2784 alu.op = ALU_OP3_MULADD;
2785 alu.is_op3 = 1;
2786
2787 alu.dst.chan = 0;
2788 alu.dst.sel = ctx->temp_reg;
2789 alu.dst.write = 1;
2790
2791 alu.src[0].sel = ctx->temp_reg;
2792 alu.src[0].chan = 0;
2793
2794 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
2795 alu.src[1].chan = 0;
2796 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
2797 alu.src[2].chan = 0;
2798
2799 if (ctx->bc->chip_class == R600) {
2800 alu.src[1].value = *(uint32_t *)&double_pi;
2801 alu.src[2].value = *(uint32_t *)&neg_pi;
2802 } else {
2803 alu.src[1].sel = V_SQ_ALU_SRC_1;
2804 alu.src[2].sel = V_SQ_ALU_SRC_0_5;
2805 alu.src[2].neg = 1;
2806 }
2807
2808 alu.last = 1;
2809 r = r600_bytecode_add_alu(ctx->bc, &alu);
2810 if (r)
2811 return r;
2812 return 0;
2813 }
2814
2815 static int cayman_trig(struct r600_shader_ctx *ctx)
2816 {
2817 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2818 struct r600_bytecode_alu alu;
2819 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
2820 int i, r;
2821
2822 r = tgsi_setup_trig(ctx);
2823 if (r)
2824 return r;
2825
2826
2827 for (i = 0; i < last_slot; i++) {
2828 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2829 alu.op = ctx->inst_info->op;
2830 alu.dst.chan = i;
2831
2832 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2833 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
2834
2835 alu.src[0].sel = ctx->temp_reg;
2836 alu.src[0].chan = 0;
2837 if (i == last_slot - 1)
2838 alu.last = 1;
2839 r = r600_bytecode_add_alu(ctx->bc, &alu);
2840 if (r)
2841 return r;
2842 }
2843 return 0;
2844 }
2845
2846 static int tgsi_trig(struct r600_shader_ctx *ctx)
2847 {
2848 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2849 struct r600_bytecode_alu alu;
2850 int i, r;
2851 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
2852
2853 r = tgsi_setup_trig(ctx);
2854 if (r)
2855 return r;
2856
2857 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2858 alu.op = ctx->inst_info->op;
2859 alu.dst.chan = 0;
2860 alu.dst.sel = ctx->temp_reg;
2861 alu.dst.write = 1;
2862
2863 alu.src[0].sel = ctx->temp_reg;
2864 alu.src[0].chan = 0;
2865 alu.last = 1;
2866 r = r600_bytecode_add_alu(ctx->bc, &alu);
2867 if (r)
2868 return r;
2869
2870 /* replicate result */
2871 for (i = 0; i < lasti + 1; i++) {
2872 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
2873 continue;
2874
2875 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2876 alu.op = ALU_OP1_MOV;
2877
2878 alu.src[0].sel = ctx->temp_reg;
2879 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2880 if (i == lasti)
2881 alu.last = 1;
2882 r = r600_bytecode_add_alu(ctx->bc, &alu);
2883 if (r)
2884 return r;
2885 }
2886 return 0;
2887 }
2888
2889 static int tgsi_scs(struct r600_shader_ctx *ctx)
2890 {
2891 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
2892 struct r600_bytecode_alu alu;
2893 int i, r;
2894
2895 /* We'll only need the trig stuff if we are going to write to the
2896 * X or Y components of the destination vector.
2897 */
2898 if (likely(inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_XY)) {
2899 r = tgsi_setup_trig(ctx);
2900 if (r)
2901 return r;
2902 }
2903
2904 /* dst.x = COS */
2905 if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_X) {
2906 if (ctx->bc->chip_class == CAYMAN) {
2907 for (i = 0 ; i < 3; i++) {
2908 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2909 alu.op = ALU_OP1_COS;
2910 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2911
2912 if (i == 0)
2913 alu.dst.write = 1;
2914 else
2915 alu.dst.write = 0;
2916 alu.src[0].sel = ctx->temp_reg;
2917 alu.src[0].chan = 0;
2918 if (i == 2)
2919 alu.last = 1;
2920 r = r600_bytecode_add_alu(ctx->bc, &alu);
2921 if (r)
2922 return r;
2923 }
2924 } else {
2925 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2926 alu.op = ALU_OP1_COS;
2927 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
2928
2929 alu.src[0].sel = ctx->temp_reg;
2930 alu.src[0].chan = 0;
2931 alu.last = 1;
2932 r = r600_bytecode_add_alu(ctx->bc, &alu);
2933 if (r)
2934 return r;
2935 }
2936 }
2937
2938 /* dst.y = SIN */
2939 if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Y) {
2940 if (ctx->bc->chip_class == CAYMAN) {
2941 for (i = 0 ; i < 3; i++) {
2942 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2943 alu.op = ALU_OP1_SIN;
2944 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
2945 if (i == 1)
2946 alu.dst.write = 1;
2947 else
2948 alu.dst.write = 0;
2949 alu.src[0].sel = ctx->temp_reg;
2950 alu.src[0].chan = 0;
2951 if (i == 2)
2952 alu.last = 1;
2953 r = r600_bytecode_add_alu(ctx->bc, &alu);
2954 if (r)
2955 return r;
2956 }
2957 } else {
2958 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2959 alu.op = ALU_OP1_SIN;
2960 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
2961
2962 alu.src[0].sel = ctx->temp_reg;
2963 alu.src[0].chan = 0;
2964 alu.last = 1;
2965 r = r600_bytecode_add_alu(ctx->bc, &alu);
2966 if (r)
2967 return r;
2968 }
2969 }
2970
2971 /* dst.z = 0.0; */
2972 if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_Z) {
2973 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2974
2975 alu.op = ALU_OP1_MOV;
2976
2977 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
2978
2979 alu.src[0].sel = V_SQ_ALU_SRC_0;
2980 alu.src[0].chan = 0;
2981
2982 alu.last = 1;
2983
2984 r = r600_bytecode_add_alu(ctx->bc, &alu);
2985 if (r)
2986 return r;
2987 }
2988
2989 /* dst.w = 1.0; */
2990 if (inst->Dst[0].Register.WriteMask & TGSI_WRITEMASK_W) {
2991 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
2992
2993 alu.op = ALU_OP1_MOV;
2994
2995 tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
2996
2997 alu.src[0].sel = V_SQ_ALU_SRC_1;
2998 alu.src[0].chan = 0;
2999
3000 alu.last = 1;
3001
3002 r = r600_bytecode_add_alu(ctx->bc, &alu);
3003 if (r)
3004 return r;
3005 }
3006
3007 return 0;
3008 }
3009
3010 static int tgsi_kill(struct r600_shader_ctx *ctx)
3011 {
3012 struct r600_bytecode_alu alu;
3013 int i, r;
3014
3015 for (i = 0; i < 4; i++) {
3016 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3017 alu.op = ctx->inst_info->op;
3018
3019 alu.dst.chan = i;
3020
3021 alu.src[0].sel = V_SQ_ALU_SRC_0;
3022
3023 if (ctx->inst_info->tgsi_opcode == TGSI_OPCODE_KILL) {
3024 alu.src[1].sel = V_SQ_ALU_SRC_1;
3025 alu.src[1].neg = 1;
3026 } else {
3027 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3028 }
3029 if (i == 3) {
3030 alu.last = 1;
3031 }
3032 r = r600_bytecode_add_alu(ctx->bc, &alu);
3033 if (r)
3034 return r;
3035 }
3036
3037 /* kill must be last in ALU */
3038 ctx->bc->force_add_cf = 1;
3039 ctx->shader->uses_kill = TRUE;
3040 return 0;
3041 }
3042
3043 static int tgsi_lit(struct r600_shader_ctx *ctx)
3044 {
3045 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3046 struct r600_bytecode_alu alu;
3047 int r;
3048
3049 /* tmp.x = max(src.y, 0.0) */
3050 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3051 alu.op = ALU_OP2_MAX;
3052 r600_bytecode_src(&alu.src[0], &ctx->src[0], 1);
3053 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
3054 alu.src[1].chan = 1;
3055
3056 alu.dst.sel = ctx->temp_reg;
3057 alu.dst.chan = 0;
3058 alu.dst.write = 1;
3059
3060 alu.last = 1;
3061 r = r600_bytecode_add_alu(ctx->bc, &alu);
3062 if (r)
3063 return r;
3064
3065 if (inst->Dst[0].Register.WriteMask & (1 << 2))
3066 {
3067 int chan;
3068 int sel;
3069 int i;
3070
3071 if (ctx->bc->chip_class == CAYMAN) {
3072 for (i = 0; i < 3; i++) {
3073 /* tmp.z = log(tmp.x) */
3074 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3075 alu.op = ALU_OP1_LOG_CLAMPED;
3076 alu.src[0].sel = ctx->temp_reg;
3077 alu.src[0].chan = 0;
3078 alu.dst.sel = ctx->temp_reg;
3079 alu.dst.chan = i;
3080 if (i == 2) {
3081 alu.dst.write = 1;
3082 alu.last = 1;
3083 } else
3084 alu.dst.write = 0;
3085
3086 r = r600_bytecode_add_alu(ctx->bc, &alu);
3087 if (r)
3088 return r;
3089 }
3090 } else {
3091 /* tmp.z = log(tmp.x) */
3092 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3093 alu.op = ALU_OP1_LOG_CLAMPED;
3094 alu.src[0].sel = ctx->temp_reg;
3095 alu.src[0].chan = 0;
3096 alu.dst.sel = ctx->temp_reg;
3097 alu.dst.chan = 2;
3098 alu.dst.write = 1;
3099 alu.last = 1;
3100 r = r600_bytecode_add_alu(ctx->bc, &alu);
3101 if (r)
3102 return r;
3103 }
3104
3105 chan = alu.dst.chan;
3106 sel = alu.dst.sel;
3107
3108 /* tmp.x = amd MUL_LIT(tmp.z, src.w, src.x ) */
3109 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3110 alu.op = ALU_OP3_MUL_LIT;
3111 alu.src[0].sel = sel;
3112 alu.src[0].chan = chan;
3113 r600_bytecode_src(&alu.src[1], &ctx->src[0], 3);
3114 r600_bytecode_src(&alu.src[2], &ctx->src[0], 0);
3115 alu.dst.sel = ctx->temp_reg;
3116 alu.dst.chan = 0;
3117 alu.dst.write = 1;
3118 alu.is_op3 = 1;
3119 alu.last = 1;
3120 r = r600_bytecode_add_alu(ctx->bc, &alu);
3121 if (r)
3122 return r;
3123
3124 if (ctx->bc->chip_class == CAYMAN) {
3125 for (i = 0; i < 3; i++) {
3126 /* dst.z = exp(tmp.x) */
3127 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3128 alu.op = ALU_OP1_EXP_IEEE;
3129 alu.src[0].sel = ctx->temp_reg;
3130 alu.src[0].chan = 0;
3131 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3132 if (i == 2) {
3133 alu.dst.write = 1;
3134 alu.last = 1;
3135 } else
3136 alu.dst.write = 0;
3137 r = r600_bytecode_add_alu(ctx->bc, &alu);
3138 if (r)
3139 return r;
3140 }
3141 } else {
3142 /* dst.z = exp(tmp.x) */
3143 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3144 alu.op = ALU_OP1_EXP_IEEE;
3145 alu.src[0].sel = ctx->temp_reg;
3146 alu.src[0].chan = 0;
3147 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
3148 alu.last = 1;
3149 r = r600_bytecode_add_alu(ctx->bc, &alu);
3150 if (r)
3151 return r;
3152 }
3153 }
3154
3155 /* dst.x, <- 1.0 */
3156 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3157 alu.op = ALU_OP1_MOV;
3158 alu.src[0].sel = V_SQ_ALU_SRC_1; /*1.0*/
3159 alu.src[0].chan = 0;
3160 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
3161 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 0) & 1;
3162 r = r600_bytecode_add_alu(ctx->bc, &alu);
3163 if (r)
3164 return r;
3165
3166 /* dst.y = max(src.x, 0.0) */
3167 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3168 alu.op = ALU_OP2_MAX;
3169 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
3170 alu.src[1].sel = V_SQ_ALU_SRC_0; /*0.0*/
3171 alu.src[1].chan = 0;
3172 tgsi_dst(ctx, &inst->Dst[0], 1, &alu.dst);
3173 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 1) & 1;
3174 r = r600_bytecode_add_alu(ctx->bc, &alu);
3175 if (r)
3176 return r;
3177
3178 /* dst.w, <- 1.0 */
3179 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3180 alu.op = ALU_OP1_MOV;
3181 alu.src[0].sel = V_SQ_ALU_SRC_1;
3182 alu.src[0].chan = 0;
3183 tgsi_dst(ctx, &inst->Dst[0], 3, &alu.dst);
3184 alu.dst.write = (inst->Dst[0].Register.WriteMask >> 3) & 1;
3185 alu.last = 1;
3186 r = r600_bytecode_add_alu(ctx->bc, &alu);
3187 if (r)
3188 return r;
3189
3190 return 0;
3191 }
3192
3193 static int tgsi_rsq(struct r600_shader_ctx *ctx)
3194 {
3195 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3196 struct r600_bytecode_alu alu;
3197 int i, r;
3198
3199 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3200
3201 /* XXX:
3202 * For state trackers other than OpenGL, we'll want to use
3203 * _RECIPSQRT_IEEE instead.
3204 */
3205 alu.op = ALU_OP1_RECIPSQRT_CLAMPED;
3206
3207 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
3208 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
3209 r600_bytecode_src_set_abs(&alu.src[i]);
3210 }
3211 alu.dst.sel = ctx->temp_reg;
3212 alu.dst.write = 1;
3213 alu.last = 1;
3214 r = r600_bytecode_add_alu(ctx->bc, &alu);
3215 if (r)
3216 return r;
3217 /* replicate result */
3218 return tgsi_helper_tempx_replicate(ctx);
3219 }
3220
3221 static int tgsi_helper_tempx_replicate(struct r600_shader_ctx *ctx)
3222 {
3223 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3224 struct r600_bytecode_alu alu;
3225 int i, r;
3226
3227 for (i = 0; i < 4; i++) {
3228 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3229 alu.src[0].sel = ctx->temp_reg;
3230 alu.op = ALU_OP1_MOV;
3231 alu.dst.chan = i;
3232 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3233 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3234 if (i == 3)
3235 alu.last = 1;
3236 r = r600_bytecode_add_alu(ctx->bc, &alu);
3237 if (r)
3238 return r;
3239 }
3240 return 0;
3241 }
3242
3243 static int tgsi_trans_srcx_replicate(struct r600_shader_ctx *ctx)
3244 {
3245 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3246 struct r600_bytecode_alu alu;
3247 int i, r;
3248
3249 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3250 alu.op = ctx->inst_info->op;
3251 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
3252 r600_bytecode_src(&alu.src[i], &ctx->src[i], 0);
3253 }
3254 alu.dst.sel = ctx->temp_reg;
3255 alu.dst.write = 1;
3256 alu.last = 1;
3257 r = r600_bytecode_add_alu(ctx->bc, &alu);
3258 if (r)
3259 return r;
3260 /* replicate result */
3261 return tgsi_helper_tempx_replicate(ctx);
3262 }
3263
3264 static int cayman_pow(struct r600_shader_ctx *ctx)
3265 {
3266 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3267 int i, r;
3268 struct r600_bytecode_alu alu;
3269 int last_slot = (inst->Dst[0].Register.WriteMask & 0x8) ? 4 : 3;
3270
3271 for (i = 0; i < 3; i++) {
3272 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3273 alu.op = ALU_OP1_LOG_IEEE;
3274 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
3275 alu.dst.sel = ctx->temp_reg;
3276 alu.dst.chan = i;
3277 alu.dst.write = 1;
3278 if (i == 2)
3279 alu.last = 1;
3280 r = r600_bytecode_add_alu(ctx->bc, &alu);
3281 if (r)
3282 return r;
3283 }
3284
3285 /* b * LOG2(a) */
3286 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3287 alu.op = ALU_OP2_MUL;
3288 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
3289 alu.src[1].sel = ctx->temp_reg;
3290 alu.dst.sel = ctx->temp_reg;
3291 alu.dst.write = 1;
3292 alu.last = 1;
3293 r = r600_bytecode_add_alu(ctx->bc, &alu);
3294 if (r)
3295 return r;
3296
3297 for (i = 0; i < last_slot; i++) {
3298 /* POW(a,b) = EXP2(b * LOG2(a))*/
3299 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3300 alu.op = ALU_OP1_EXP_IEEE;
3301 alu.src[0].sel = ctx->temp_reg;
3302
3303 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
3304 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
3305 if (i == last_slot - 1)
3306 alu.last = 1;
3307 r = r600_bytecode_add_alu(ctx->bc, &alu);
3308 if (r)
3309 return r;
3310 }
3311 return 0;
3312 }
3313
3314 static int tgsi_pow(struct r600_shader_ctx *ctx)
3315 {
3316 struct r600_bytecode_alu alu;
3317 int r;
3318
3319 /* LOG2(a) */
3320 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3321 alu.op = ALU_OP1_LOG_IEEE;
3322 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
3323 alu.dst.sel = ctx->temp_reg;
3324 alu.dst.write = 1;
3325 alu.last = 1;
3326 r = r600_bytecode_add_alu(ctx->bc, &alu);
3327 if (r)
3328 return r;
3329 /* b * LOG2(a) */
3330 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3331 alu.op = ALU_OP2_MUL;
3332 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
3333 alu.src[1].sel = ctx->temp_reg;
3334 alu.dst.sel = ctx->temp_reg;
3335 alu.dst.write = 1;
3336 alu.last = 1;
3337 r = r600_bytecode_add_alu(ctx->bc, &alu);
3338 if (r)
3339 return r;
3340 /* POW(a,b) = EXP2(b * LOG2(a))*/
3341 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3342 alu.op = ALU_OP1_EXP_IEEE;
3343 alu.src[0].sel = ctx->temp_reg;
3344 alu.dst.sel = ctx->temp_reg;
3345 alu.dst.write = 1;
3346 alu.last = 1;
3347 r = r600_bytecode_add_alu(ctx->bc, &alu);
3348 if (r)
3349 return r;
3350 return tgsi_helper_tempx_replicate(ctx);
3351 }
3352
3353 static int tgsi_divmod(struct r600_shader_ctx *ctx, int mod, int signed_op)
3354 {
3355 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
3356 struct r600_bytecode_alu alu;
3357 int i, r, j;
3358 unsigned write_mask = inst->Dst[0].Register.WriteMask;
3359 int tmp0 = ctx->temp_reg;
3360 int tmp1 = r600_get_temp(ctx);
3361 int tmp2 = r600_get_temp(ctx);
3362 int tmp3 = r600_get_temp(ctx);
3363 /* Unsigned path:
3364 *
3365 * we need to represent src1 as src2*q + r, where q - quotient, r - remainder
3366 *
3367 * 1. tmp0.x = rcp (src2) = 2^32/src2 + e, where e is rounding error
3368 * 2. tmp0.z = lo (tmp0.x * src2)
3369 * 3. tmp0.w = -tmp0.z
3370 * 4. tmp0.y = hi (tmp0.x * src2)
3371 * 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src2))
3372 * 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error
3373 * 7. tmp1.x = tmp0.x - tmp0.w
3374 * 8. tmp1.y = tmp0.x + tmp0.w
3375 * 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x)
3376 * 10. tmp0.z = hi(tmp0.x * src1) = q
3377 * 11. tmp0.y = lo (tmp0.z * src2) = src2*q = src1 - r
3378 *
3379 * 12. tmp0.w = src1 - tmp0.y = r
3380 * 13. tmp1.x = tmp0.w >= src2 = r >= src2 (uint comparison)
3381 * 14. tmp1.y = src1 >= tmp0.y = r >= 0 (uint comparison)
3382 *
3383 * if DIV
3384 *
3385 * 15. tmp1.z = tmp0.z + 1 = q + 1
3386 * 16. tmp1.w = tmp0.z - 1 = q - 1
3387 *
3388 * else MOD
3389 *
3390 * 15. tmp1.z = tmp0.w - src2 = r - src2
3391 * 16. tmp1.w = tmp0.w + src2 = r + src2
3392 *
3393 * endif
3394 *
3395 * 17. tmp1.x = tmp1.x & tmp1.y
3396 *
3397 * DIV: 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z
3398 * MOD: 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z
3399 *
3400 * 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z
3401 * 20. dst = src2==0 ? MAX_UINT : tmp0.z
3402 *
3403 * Signed path:
3404 *
3405 * Same as unsigned, using abs values of the operands,
3406 * and fixing the sign of the result in the end.
3407 */
3408
3409 for (i = 0; i < 4; i++) {
3410 if (!(write_mask & (1<<i)))
3411 continue;
3412
3413 if (signed_op) {
3414
3415 /* tmp2.x = -src0 */
3416 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3417 alu.op = ALU_OP2_SUB_INT;
3418
3419 alu.dst.sel = tmp2;
3420 alu.dst.chan = 0;
3421 alu.dst.write = 1;
3422
3423 alu.src[0].sel = V_SQ_ALU_SRC_0;
3424
3425 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3426
3427 alu.last = 1;
3428 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3429 return r;
3430
3431 /* tmp2.y = -src1 */
3432 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3433 alu.op = ALU_OP2_SUB_INT;
3434
3435 alu.dst.sel = tmp2;
3436 alu.dst.chan = 1;
3437 alu.dst.write = 1;
3438
3439 alu.src[0].sel = V_SQ_ALU_SRC_0;
3440
3441 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3442
3443 alu.last = 1;
3444 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3445 return r;
3446
3447 /* tmp2.z sign bit is set if src0 and src2 signs are different */
3448 /* it will be a sign of the quotient */
3449 if (!mod) {
3450
3451 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3452 alu.op = ALU_OP2_XOR_INT;
3453
3454 alu.dst.sel = tmp2;
3455 alu.dst.chan = 2;
3456 alu.dst.write = 1;
3457
3458 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3459 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3460
3461 alu.last = 1;
3462 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3463 return r;
3464 }
3465
3466 /* tmp2.x = |src0| */
3467 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3468 alu.op = ALU_OP3_CNDGE_INT;
3469 alu.is_op3 = 1;
3470
3471 alu.dst.sel = tmp2;
3472 alu.dst.chan = 0;
3473 alu.dst.write = 1;
3474
3475 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3476 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3477 alu.src[2].sel = tmp2;
3478 alu.src[2].chan = 0;
3479
3480 alu.last = 1;
3481 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3482 return r;
3483
3484 /* tmp2.y = |src1| */
3485 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3486 alu.op = ALU_OP3_CNDGE_INT;
3487 alu.is_op3 = 1;
3488
3489 alu.dst.sel = tmp2;
3490 alu.dst.chan = 1;
3491 alu.dst.write = 1;
3492
3493 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3494 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3495 alu.src[2].sel = tmp2;
3496 alu.src[2].chan = 1;
3497
3498 alu.last = 1;
3499 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3500 return r;
3501
3502 }
3503
3504 /* 1. tmp0.x = rcp_u (src2) = 2^32/src2 + e, where e is rounding error */
3505 if (ctx->bc->chip_class == CAYMAN) {
3506 /* tmp3.x = u2f(src2) */
3507 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3508 alu.op = ALU_OP1_UINT_TO_FLT;
3509
3510 alu.dst.sel = tmp3;
3511 alu.dst.chan = 0;
3512 alu.dst.write = 1;
3513
3514 if (signed_op) {
3515 alu.src[0].sel = tmp2;
3516 alu.src[0].chan = 1;
3517 } else {
3518 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3519 }
3520
3521 alu.last = 1;
3522 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3523 return r;
3524
3525 /* tmp0.x = recip(tmp3.x) */
3526 for (j = 0 ; j < 3; j++) {
3527 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3528 alu.op = ALU_OP1_RECIP_IEEE;
3529
3530 alu.dst.sel = tmp0;
3531 alu.dst.chan = j;
3532 alu.dst.write = (j == 0);
3533
3534 alu.src[0].sel = tmp3;
3535 alu.src[0].chan = 0;
3536
3537 if (j == 2)
3538 alu.last = 1;
3539 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3540 return r;
3541 }
3542
3543 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3544 alu.op = ALU_OP2_MUL;
3545
3546 alu.src[0].sel = tmp0;
3547 alu.src[0].chan = 0;
3548
3549 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
3550 alu.src[1].value = 0x4f800000;
3551
3552 alu.dst.sel = tmp3;
3553 alu.dst.write = 1;
3554 alu.last = 1;
3555 r = r600_bytecode_add_alu(ctx->bc, &alu);
3556 if (r)
3557 return r;
3558
3559 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3560 alu.op = ALU_OP1_FLT_TO_UINT;
3561
3562 alu.dst.sel = tmp0;
3563 alu.dst.chan = 0;
3564 alu.dst.write = 1;
3565
3566 alu.src[0].sel = tmp3;
3567 alu.src[0].chan = 0;
3568
3569 alu.last = 1;
3570 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3571 return r;
3572
3573 } else {
3574 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3575 alu.op = ALU_OP1_RECIP_UINT;
3576
3577 alu.dst.sel = tmp0;
3578 alu.dst.chan = 0;
3579 alu.dst.write = 1;
3580
3581 if (signed_op) {
3582 alu.src[0].sel = tmp2;
3583 alu.src[0].chan = 1;
3584 } else {
3585 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3586 }
3587
3588 alu.last = 1;
3589 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3590 return r;
3591 }
3592
3593 /* 2. tmp0.z = lo (tmp0.x * src2) */
3594 if (ctx->bc->chip_class == CAYMAN) {
3595 for (j = 0 ; j < 4; j++) {
3596 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3597 alu.op = ALU_OP2_MULLO_UINT;
3598
3599 alu.dst.sel = tmp0;
3600 alu.dst.chan = j;
3601 alu.dst.write = (j == 2);
3602
3603 alu.src[0].sel = tmp0;
3604 alu.src[0].chan = 0;
3605 if (signed_op) {
3606 alu.src[1].sel = tmp2;
3607 alu.src[1].chan = 1;
3608 } else {
3609 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3610 }
3611
3612 alu.last = (j == 3);
3613 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3614 return r;
3615 }
3616 } else {
3617 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3618 alu.op = ALU_OP2_MULLO_UINT;
3619
3620 alu.dst.sel = tmp0;
3621 alu.dst.chan = 2;
3622 alu.dst.write = 1;
3623
3624 alu.src[0].sel = tmp0;
3625 alu.src[0].chan = 0;
3626 if (signed_op) {
3627 alu.src[1].sel = tmp2;
3628 alu.src[1].chan = 1;
3629 } else {
3630 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3631 }
3632
3633 alu.last = 1;
3634 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3635 return r;
3636 }
3637
3638 /* 3. tmp0.w = -tmp0.z */
3639 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3640 alu.op = ALU_OP2_SUB_INT;
3641
3642 alu.dst.sel = tmp0;
3643 alu.dst.chan = 3;
3644 alu.dst.write = 1;
3645
3646 alu.src[0].sel = V_SQ_ALU_SRC_0;
3647 alu.src[1].sel = tmp0;
3648 alu.src[1].chan = 2;
3649
3650 alu.last = 1;
3651 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3652 return r;
3653
3654 /* 4. tmp0.y = hi (tmp0.x * src2) */
3655 if (ctx->bc->chip_class == CAYMAN) {
3656 for (j = 0 ; j < 4; j++) {
3657 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3658 alu.op = ALU_OP2_MULHI_UINT;
3659
3660 alu.dst.sel = tmp0;
3661 alu.dst.chan = j;
3662 alu.dst.write = (j == 1);
3663
3664 alu.src[0].sel = tmp0;
3665 alu.src[0].chan = 0;
3666
3667 if (signed_op) {
3668 alu.src[1].sel = tmp2;
3669 alu.src[1].chan = 1;
3670 } else {
3671 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3672 }
3673 alu.last = (j == 3);
3674 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3675 return r;
3676 }
3677 } else {
3678 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3679 alu.op = ALU_OP2_MULHI_UINT;
3680
3681 alu.dst.sel = tmp0;
3682 alu.dst.chan = 1;
3683 alu.dst.write = 1;
3684
3685 alu.src[0].sel = tmp0;
3686 alu.src[0].chan = 0;
3687
3688 if (signed_op) {
3689 alu.src[1].sel = tmp2;
3690 alu.src[1].chan = 1;
3691 } else {
3692 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3693 }
3694
3695 alu.last = 1;
3696 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3697 return r;
3698 }
3699
3700 /* 5. tmp0.z = (tmp0.y == 0 ? tmp0.w : tmp0.z) = abs(lo(rcp*src)) */
3701 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3702 alu.op = ALU_OP3_CNDE_INT;
3703 alu.is_op3 = 1;
3704
3705 alu.dst.sel = tmp0;
3706 alu.dst.chan = 2;
3707 alu.dst.write = 1;
3708
3709 alu.src[0].sel = tmp0;
3710 alu.src[0].chan = 1;
3711 alu.src[1].sel = tmp0;
3712 alu.src[1].chan = 3;
3713 alu.src[2].sel = tmp0;
3714 alu.src[2].chan = 2;
3715
3716 alu.last = 1;
3717 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3718 return r;
3719
3720 /* 6. tmp0.w = hi (tmp0.z * tmp0.x) = e, rounding error */
3721 if (ctx->bc->chip_class == CAYMAN) {
3722 for (j = 0 ; j < 4; j++) {
3723 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3724 alu.op = ALU_OP2_MULHI_UINT;
3725
3726 alu.dst.sel = tmp0;
3727 alu.dst.chan = j;
3728 alu.dst.write = (j == 3);
3729
3730 alu.src[0].sel = tmp0;
3731 alu.src[0].chan = 2;
3732
3733 alu.src[1].sel = tmp0;
3734 alu.src[1].chan = 0;
3735
3736 alu.last = (j == 3);
3737 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3738 return r;
3739 }
3740 } else {
3741 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3742 alu.op = ALU_OP2_MULHI_UINT;
3743
3744 alu.dst.sel = tmp0;
3745 alu.dst.chan = 3;
3746 alu.dst.write = 1;
3747
3748 alu.src[0].sel = tmp0;
3749 alu.src[0].chan = 2;
3750
3751 alu.src[1].sel = tmp0;
3752 alu.src[1].chan = 0;
3753
3754 alu.last = 1;
3755 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3756 return r;
3757 }
3758
3759 /* 7. tmp1.x = tmp0.x - tmp0.w */
3760 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3761 alu.op = ALU_OP2_SUB_INT;
3762
3763 alu.dst.sel = tmp1;
3764 alu.dst.chan = 0;
3765 alu.dst.write = 1;
3766
3767 alu.src[0].sel = tmp0;
3768 alu.src[0].chan = 0;
3769 alu.src[1].sel = tmp0;
3770 alu.src[1].chan = 3;
3771
3772 alu.last = 1;
3773 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3774 return r;
3775
3776 /* 8. tmp1.y = tmp0.x + tmp0.w */
3777 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3778 alu.op = ALU_OP2_ADD_INT;
3779
3780 alu.dst.sel = tmp1;
3781 alu.dst.chan = 1;
3782 alu.dst.write = 1;
3783
3784 alu.src[0].sel = tmp0;
3785 alu.src[0].chan = 0;
3786 alu.src[1].sel = tmp0;
3787 alu.src[1].chan = 3;
3788
3789 alu.last = 1;
3790 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3791 return r;
3792
3793 /* 9. tmp0.x = (tmp0.y == 0 ? tmp1.y : tmp1.x) */
3794 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3795 alu.op = ALU_OP3_CNDE_INT;
3796 alu.is_op3 = 1;
3797
3798 alu.dst.sel = tmp0;
3799 alu.dst.chan = 0;
3800 alu.dst.write = 1;
3801
3802 alu.src[0].sel = tmp0;
3803 alu.src[0].chan = 1;
3804 alu.src[1].sel = tmp1;
3805 alu.src[1].chan = 1;
3806 alu.src[2].sel = tmp1;
3807 alu.src[2].chan = 0;
3808
3809 alu.last = 1;
3810 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3811 return r;
3812
3813 /* 10. tmp0.z = hi(tmp0.x * src1) = q */
3814 if (ctx->bc->chip_class == CAYMAN) {
3815 for (j = 0 ; j < 4; j++) {
3816 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3817 alu.op = ALU_OP2_MULHI_UINT;
3818
3819 alu.dst.sel = tmp0;
3820 alu.dst.chan = j;
3821 alu.dst.write = (j == 2);
3822
3823 alu.src[0].sel = tmp0;
3824 alu.src[0].chan = 0;
3825
3826 if (signed_op) {
3827 alu.src[1].sel = tmp2;
3828 alu.src[1].chan = 0;
3829 } else {
3830 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3831 }
3832
3833 alu.last = (j == 3);
3834 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3835 return r;
3836 }
3837 } else {
3838 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3839 alu.op = ALU_OP2_MULHI_UINT;
3840
3841 alu.dst.sel = tmp0;
3842 alu.dst.chan = 2;
3843 alu.dst.write = 1;
3844
3845 alu.src[0].sel = tmp0;
3846 alu.src[0].chan = 0;
3847
3848 if (signed_op) {
3849 alu.src[1].sel = tmp2;
3850 alu.src[1].chan = 0;
3851 } else {
3852 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
3853 }
3854
3855 alu.last = 1;
3856 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3857 return r;
3858 }
3859
3860 /* 11. tmp0.y = lo (src2 * tmp0.z) = src2*q = src1 - r */
3861 if (ctx->bc->chip_class == CAYMAN) {
3862 for (j = 0 ; j < 4; j++) {
3863 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3864 alu.op = ALU_OP2_MULLO_UINT;
3865
3866 alu.dst.sel = tmp0;
3867 alu.dst.chan = j;
3868 alu.dst.write = (j == 1);
3869
3870 if (signed_op) {
3871 alu.src[0].sel = tmp2;
3872 alu.src[0].chan = 1;
3873 } else {
3874 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3875 }
3876
3877 alu.src[1].sel = tmp0;
3878 alu.src[1].chan = 2;
3879
3880 alu.last = (j == 3);
3881 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3882 return r;
3883 }
3884 } else {
3885 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3886 alu.op = ALU_OP2_MULLO_UINT;
3887
3888 alu.dst.sel = tmp0;
3889 alu.dst.chan = 1;
3890 alu.dst.write = 1;
3891
3892 if (signed_op) {
3893 alu.src[0].sel = tmp2;
3894 alu.src[0].chan = 1;
3895 } else {
3896 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
3897 }
3898
3899 alu.src[1].sel = tmp0;
3900 alu.src[1].chan = 2;
3901
3902 alu.last = 1;
3903 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3904 return r;
3905 }
3906
3907 /* 12. tmp0.w = src1 - tmp0.y = r */
3908 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3909 alu.op = ALU_OP2_SUB_INT;
3910
3911 alu.dst.sel = tmp0;
3912 alu.dst.chan = 3;
3913 alu.dst.write = 1;
3914
3915 if (signed_op) {
3916 alu.src[0].sel = tmp2;
3917 alu.src[0].chan = 0;
3918 } else {
3919 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3920 }
3921
3922 alu.src[1].sel = tmp0;
3923 alu.src[1].chan = 1;
3924
3925 alu.last = 1;
3926 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3927 return r;
3928
3929 /* 13. tmp1.x = tmp0.w >= src2 = r >= src2 */
3930 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3931 alu.op = ALU_OP2_SETGE_UINT;
3932
3933 alu.dst.sel = tmp1;
3934 alu.dst.chan = 0;
3935 alu.dst.write = 1;
3936
3937 alu.src[0].sel = tmp0;
3938 alu.src[0].chan = 3;
3939 if (signed_op) {
3940 alu.src[1].sel = tmp2;
3941 alu.src[1].chan = 1;
3942 } else {
3943 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3944 }
3945
3946 alu.last = 1;
3947 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3948 return r;
3949
3950 /* 14. tmp1.y = src1 >= tmp0.y = r >= 0 */
3951 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3952 alu.op = ALU_OP2_SETGE_UINT;
3953
3954 alu.dst.sel = tmp1;
3955 alu.dst.chan = 1;
3956 alu.dst.write = 1;
3957
3958 if (signed_op) {
3959 alu.src[0].sel = tmp2;
3960 alu.src[0].chan = 0;
3961 } else {
3962 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
3963 }
3964
3965 alu.src[1].sel = tmp0;
3966 alu.src[1].chan = 1;
3967
3968 alu.last = 1;
3969 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3970 return r;
3971
3972 if (mod) { /* UMOD */
3973
3974 /* 15. tmp1.z = tmp0.w - src2 = r - src2 */
3975 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3976 alu.op = ALU_OP2_SUB_INT;
3977
3978 alu.dst.sel = tmp1;
3979 alu.dst.chan = 2;
3980 alu.dst.write = 1;
3981
3982 alu.src[0].sel = tmp0;
3983 alu.src[0].chan = 3;
3984
3985 if (signed_op) {
3986 alu.src[1].sel = tmp2;
3987 alu.src[1].chan = 1;
3988 } else {
3989 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
3990 }
3991
3992 alu.last = 1;
3993 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
3994 return r;
3995
3996 /* 16. tmp1.w = tmp0.w + src2 = r + src2 */
3997 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
3998 alu.op = ALU_OP2_ADD_INT;
3999
4000 alu.dst.sel = tmp1;
4001 alu.dst.chan = 3;
4002 alu.dst.write = 1;
4003
4004 alu.src[0].sel = tmp0;
4005 alu.src[0].chan = 3;
4006 if (signed_op) {
4007 alu.src[1].sel = tmp2;
4008 alu.src[1].chan = 1;
4009 } else {
4010 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
4011 }
4012
4013 alu.last = 1;
4014 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4015 return r;
4016
4017 } else { /* UDIV */
4018
4019 /* 15. tmp1.z = tmp0.z + 1 = q + 1 DIV */
4020 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4021 alu.op = ALU_OP2_ADD_INT;
4022
4023 alu.dst.sel = tmp1;
4024 alu.dst.chan = 2;
4025 alu.dst.write = 1;
4026
4027 alu.src[0].sel = tmp0;
4028 alu.src[0].chan = 2;
4029 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
4030
4031 alu.last = 1;
4032 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4033 return r;
4034
4035 /* 16. tmp1.w = tmp0.z - 1 = q - 1 */
4036 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4037 alu.op = ALU_OP2_ADD_INT;
4038
4039 alu.dst.sel = tmp1;
4040 alu.dst.chan = 3;
4041 alu.dst.write = 1;
4042
4043 alu.src[0].sel = tmp0;
4044 alu.src[0].chan = 2;
4045 alu.src[1].sel = V_SQ_ALU_SRC_M_1_INT;
4046
4047 alu.last = 1;
4048 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4049 return r;
4050
4051 }
4052
4053 /* 17. tmp1.x = tmp1.x & tmp1.y */
4054 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4055 alu.op = ALU_OP2_AND_INT;
4056
4057 alu.dst.sel = tmp1;
4058 alu.dst.chan = 0;
4059 alu.dst.write = 1;
4060
4061 alu.src[0].sel = tmp1;
4062 alu.src[0].chan = 0;
4063 alu.src[1].sel = tmp1;
4064 alu.src[1].chan = 1;
4065
4066 alu.last = 1;
4067 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4068 return r;
4069
4070 /* 18. tmp0.z = tmp1.x==0 ? tmp0.z : tmp1.z DIV */
4071 /* 18. tmp0.z = tmp1.x==0 ? tmp0.w : tmp1.z MOD */
4072 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4073 alu.op = ALU_OP3_CNDE_INT;
4074 alu.is_op3 = 1;
4075
4076 alu.dst.sel = tmp0;
4077 alu.dst.chan = 2;
4078 alu.dst.write = 1;
4079
4080 alu.src[0].sel = tmp1;
4081 alu.src[0].chan = 0;
4082 alu.src[1].sel = tmp0;
4083 alu.src[1].chan = mod ? 3 : 2;
4084 alu.src[2].sel = tmp1;
4085 alu.src[2].chan = 2;
4086
4087 alu.last = 1;
4088 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4089 return r;
4090
4091 /* 19. tmp0.z = tmp1.y==0 ? tmp1.w : tmp0.z */
4092 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4093 alu.op = ALU_OP3_CNDE_INT;
4094 alu.is_op3 = 1;
4095
4096 if (signed_op) {
4097 alu.dst.sel = tmp0;
4098 alu.dst.chan = 2;
4099 alu.dst.write = 1;
4100 } else {
4101 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4102 }
4103
4104 alu.src[0].sel = tmp1;
4105 alu.src[0].chan = 1;
4106 alu.src[1].sel = tmp1;
4107 alu.src[1].chan = 3;
4108 alu.src[2].sel = tmp0;
4109 alu.src[2].chan = 2;
4110
4111 alu.last = 1;
4112 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4113 return r;
4114
4115 if (signed_op) {
4116
4117 /* fix the sign of the result */
4118
4119 if (mod) {
4120
4121 /* tmp0.x = -tmp0.z */
4122 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4123 alu.op = ALU_OP2_SUB_INT;
4124
4125 alu.dst.sel = tmp0;
4126 alu.dst.chan = 0;
4127 alu.dst.write = 1;
4128
4129 alu.src[0].sel = V_SQ_ALU_SRC_0;
4130 alu.src[1].sel = tmp0;
4131 alu.src[1].chan = 2;
4132
4133 alu.last = 1;
4134 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4135 return r;
4136
4137 /* sign of the remainder is the same as the sign of src0 */
4138 /* tmp0.x = src0>=0 ? tmp0.z : tmp0.x */
4139 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4140 alu.op = ALU_OP3_CNDGE_INT;
4141 alu.is_op3 = 1;
4142
4143 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4144
4145 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4146 alu.src[1].sel = tmp0;
4147 alu.src[1].chan = 2;
4148 alu.src[2].sel = tmp0;
4149 alu.src[2].chan = 0;
4150
4151 alu.last = 1;
4152 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4153 return r;
4154
4155 } else {
4156
4157 /* tmp0.x = -tmp0.z */
4158 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4159 alu.op = ALU_OP2_SUB_INT;
4160
4161 alu.dst.sel = tmp0;
4162 alu.dst.chan = 0;
4163 alu.dst.write = 1;
4164
4165 alu.src[0].sel = V_SQ_ALU_SRC_0;
4166 alu.src[1].sel = tmp0;
4167 alu.src[1].chan = 2;
4168
4169 alu.last = 1;
4170 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4171 return r;
4172
4173 /* fix the quotient sign (same as the sign of src0*src1) */
4174 /* tmp0.x = tmp2.z>=0 ? tmp0.z : tmp0.x */
4175 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4176 alu.op = ALU_OP3_CNDGE_INT;
4177 alu.is_op3 = 1;
4178
4179 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4180
4181 alu.src[0].sel = tmp2;
4182 alu.src[0].chan = 2;
4183 alu.src[1].sel = tmp0;
4184 alu.src[1].chan = 2;
4185 alu.src[2].sel = tmp0;
4186 alu.src[2].chan = 0;
4187
4188 alu.last = 1;
4189 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
4190 return r;
4191 }
4192 }
4193 }
4194 return 0;
4195 }
4196
/* TGSI UDIV: unsigned quotient, i.e. tgsi_divmod() with mod = 0, signed_op = 0. */
static int tgsi_udiv(struct r600_shader_ctx *ctx)
{
	const int mod = 0;
	const int signed_op = 0;

	return tgsi_divmod(ctx, mod, signed_op);
}
4201
/* TGSI UMOD: unsigned remainder, i.e. tgsi_divmod() with mod = 1, signed_op = 0. */
static int tgsi_umod(struct r600_shader_ctx *ctx)
{
	const int mod = 1;
	const int signed_op = 0;

	return tgsi_divmod(ctx, mod, signed_op);
}
4206
/* TGSI IDIV: signed quotient, i.e. tgsi_divmod() with mod = 0, signed_op = 1. */
static int tgsi_idiv(struct r600_shader_ctx *ctx)
{
	const int mod = 0;
	const int signed_op = 1;

	return tgsi_divmod(ctx, mod, signed_op);
}
4211
/* Signed remainder, i.e. tgsi_divmod() with mod = 1, signed_op = 1. */
static int tgsi_imod(struct r600_shader_ctx *ctx)
{
	const int mod = 1;
	const int signed_op = 1;

	return tgsi_divmod(ctx, mod, signed_op);
}
4216
4217
4218 static int tgsi_f2i(struct r600_shader_ctx *ctx)
4219 {
4220 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4221 struct r600_bytecode_alu alu;
4222 int i, r;
4223 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4224 int last_inst = tgsi_last_instruction(write_mask);
4225
4226 for (i = 0; i < 4; i++) {
4227 if (!(write_mask & (1<<i)))
4228 continue;
4229
4230 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4231 alu.op = ALU_OP1_TRUNC;
4232
4233 alu.dst.sel = ctx->temp_reg;
4234 alu.dst.chan = i;
4235 alu.dst.write = 1;
4236
4237 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4238 if (i == last_inst)
4239 alu.last = 1;
4240 r = r600_bytecode_add_alu(ctx->bc, &alu);
4241 if (r)
4242 return r;
4243 }
4244
4245 for (i = 0; i < 4; i++) {
4246 if (!(write_mask & (1<<i)))
4247 continue;
4248
4249 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4250 alu.op = ctx->inst_info->op;
4251
4252 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4253
4254 alu.src[0].sel = ctx->temp_reg;
4255 alu.src[0].chan = i;
4256
4257 if (i == last_inst || alu.op == ALU_OP1_FLT_TO_UINT)
4258 alu.last = 1;
4259 r = r600_bytecode_add_alu(ctx->bc, &alu);
4260 if (r)
4261 return r;
4262 }
4263
4264 return 0;
4265 }
4266
4267 static int tgsi_iabs(struct r600_shader_ctx *ctx)
4268 {
4269 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4270 struct r600_bytecode_alu alu;
4271 int i, r;
4272 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4273 int last_inst = tgsi_last_instruction(write_mask);
4274
4275 /* tmp = -src */
4276 for (i = 0; i < 4; i++) {
4277 if (!(write_mask & (1<<i)))
4278 continue;
4279
4280 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4281 alu.op = ALU_OP2_SUB_INT;
4282
4283 alu.dst.sel = ctx->temp_reg;
4284 alu.dst.chan = i;
4285 alu.dst.write = 1;
4286
4287 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4288 alu.src[0].sel = V_SQ_ALU_SRC_0;
4289
4290 if (i == last_inst)
4291 alu.last = 1;
4292 r = r600_bytecode_add_alu(ctx->bc, &alu);
4293 if (r)
4294 return r;
4295 }
4296
4297 /* dst = (src >= 0 ? src : tmp) */
4298 for (i = 0; i < 4; i++) {
4299 if (!(write_mask & (1<<i)))
4300 continue;
4301
4302 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4303 alu.op = ALU_OP3_CNDGE_INT;
4304 alu.is_op3 = 1;
4305 alu.dst.write = 1;
4306
4307 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4308
4309 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4310 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4311 alu.src[2].sel = ctx->temp_reg;
4312 alu.src[2].chan = i;
4313
4314 if (i == last_inst)
4315 alu.last = 1;
4316 r = r600_bytecode_add_alu(ctx->bc, &alu);
4317 if (r)
4318 return r;
4319 }
4320 return 0;
4321 }
4322
4323 static int tgsi_issg(struct r600_shader_ctx *ctx)
4324 {
4325 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4326 struct r600_bytecode_alu alu;
4327 int i, r;
4328 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4329 int last_inst = tgsi_last_instruction(write_mask);
4330
4331 /* tmp = (src >= 0 ? src : -1) */
4332 for (i = 0; i < 4; i++) {
4333 if (!(write_mask & (1<<i)))
4334 continue;
4335
4336 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4337 alu.op = ALU_OP3_CNDGE_INT;
4338 alu.is_op3 = 1;
4339
4340 alu.dst.sel = ctx->temp_reg;
4341 alu.dst.chan = i;
4342 alu.dst.write = 1;
4343
4344 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4345 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
4346 alu.src[2].sel = V_SQ_ALU_SRC_M_1_INT;
4347
4348 if (i == last_inst)
4349 alu.last = 1;
4350 r = r600_bytecode_add_alu(ctx->bc, &alu);
4351 if (r)
4352 return r;
4353 }
4354
4355 /* dst = (tmp > 0 ? 1 : tmp) */
4356 for (i = 0; i < 4; i++) {
4357 if (!(write_mask & (1<<i)))
4358 continue;
4359
4360 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4361 alu.op = ALU_OP3_CNDGT_INT;
4362 alu.is_op3 = 1;
4363 alu.dst.write = 1;
4364
4365 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4366
4367 alu.src[0].sel = ctx->temp_reg;
4368 alu.src[0].chan = i;
4369
4370 alu.src[1].sel = V_SQ_ALU_SRC_1_INT;
4371
4372 alu.src[2].sel = ctx->temp_reg;
4373 alu.src[2].chan = i;
4374
4375 if (i == last_inst)
4376 alu.last = 1;
4377 r = r600_bytecode_add_alu(ctx->bc, &alu);
4378 if (r)
4379 return r;
4380 }
4381 return 0;
4382 }
4383
4384
4385
4386 static int tgsi_ssg(struct r600_shader_ctx *ctx)
4387 {
4388 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4389 struct r600_bytecode_alu alu;
4390 int i, r;
4391
4392 /* tmp = (src > 0 ? 1 : src) */
4393 for (i = 0; i < 4; i++) {
4394 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4395 alu.op = ALU_OP3_CNDGT;
4396 alu.is_op3 = 1;
4397
4398 alu.dst.sel = ctx->temp_reg;
4399 alu.dst.chan = i;
4400
4401 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4402 alu.src[1].sel = V_SQ_ALU_SRC_1;
4403 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
4404
4405 if (i == 3)
4406 alu.last = 1;
4407 r = r600_bytecode_add_alu(ctx->bc, &alu);
4408 if (r)
4409 return r;
4410 }
4411
4412 /* dst = (-tmp > 0 ? -1 : tmp) */
4413 for (i = 0; i < 4; i++) {
4414 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4415 alu.op = ALU_OP3_CNDGT;
4416 alu.is_op3 = 1;
4417 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4418
4419 alu.src[0].sel = ctx->temp_reg;
4420 alu.src[0].chan = i;
4421 alu.src[0].neg = 1;
4422
4423 alu.src[1].sel = V_SQ_ALU_SRC_1;
4424 alu.src[1].neg = 1;
4425
4426 alu.src[2].sel = ctx->temp_reg;
4427 alu.src[2].chan = i;
4428
4429 if (i == 3)
4430 alu.last = 1;
4431 r = r600_bytecode_add_alu(ctx->bc, &alu);
4432 if (r)
4433 return r;
4434 }
4435 return 0;
4436 }
4437
4438 static int tgsi_bfi(struct r600_shader_ctx *ctx)
4439 {
4440 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4441 struct r600_bytecode_alu alu;
4442 int i, r, t1, t2;
4443
4444 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4445 int last_inst = tgsi_last_instruction(write_mask);
4446
4447 t1 = ctx->temp_reg;
4448
4449 for (i = 0; i < 4; i++) {
4450 if (!(write_mask & (1<<i)))
4451 continue;
4452
4453 /* create mask tmp */
4454 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4455 alu.op = ALU_OP2_BFM_INT;
4456 alu.dst.sel = t1;
4457 alu.dst.chan = i;
4458 alu.dst.write = 1;
4459 alu.last = i == last_inst;
4460
4461 r600_bytecode_src(&alu.src[0], &ctx->src[3], i);
4462 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4463
4464 r = r600_bytecode_add_alu(ctx->bc, &alu);
4465 if (r)
4466 return r;
4467 }
4468
4469 t2 = r600_get_temp(ctx);
4470
4471 for (i = 0; i < 4; i++) {
4472 if (!(write_mask & (1<<i)))
4473 continue;
4474
4475 /* shift insert left */
4476 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4477 alu.op = ALU_OP2_LSHL_INT;
4478 alu.dst.sel = t2;
4479 alu.dst.chan = i;
4480 alu.dst.write = 1;
4481 alu.last = i == last_inst;
4482
4483 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
4484 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
4485
4486 r = r600_bytecode_add_alu(ctx->bc, &alu);
4487 if (r)
4488 return r;
4489 }
4490
4491 for (i = 0; i < 4; i++) {
4492 if (!(write_mask & (1<<i)))
4493 continue;
4494
4495 /* actual bitfield insert */
4496 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4497 alu.op = ALU_OP3_BFI_INT;
4498 alu.is_op3 = 1;
4499 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4500 alu.dst.chan = i;
4501 alu.dst.write = 1;
4502 alu.last = i == last_inst;
4503
4504 alu.src[0].sel = t1;
4505 alu.src[0].chan = i;
4506 alu.src[1].sel = t2;
4507 alu.src[1].chan = i;
4508 r600_bytecode_src(&alu.src[2], &ctx->src[0], i);
4509
4510 r = r600_bytecode_add_alu(ctx->bc, &alu);
4511 if (r)
4512 return r;
4513 }
4514
4515 return 0;
4516 }
4517
4518 static int tgsi_msb(struct r600_shader_ctx *ctx)
4519 {
4520 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4521 struct r600_bytecode_alu alu;
4522 int i, r, t1, t2;
4523
4524 unsigned write_mask = inst->Dst[0].Register.WriteMask;
4525 int last_inst = tgsi_last_instruction(write_mask);
4526
4527 assert(ctx->inst_info->op == ALU_OP1_FFBH_INT ||
4528 ctx->inst_info->op == ALU_OP1_FFBH_UINT);
4529
4530 t1 = ctx->temp_reg;
4531
4532 /* bit position is indexed from lsb by TGSI, and from msb by the hardware */
4533 for (i = 0; i < 4; i++) {
4534 if (!(write_mask & (1<<i)))
4535 continue;
4536
4537 /* t1 = FFBH_INT / FFBH_UINT */
4538 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4539 alu.op = ctx->inst_info->op;
4540 alu.dst.sel = t1;
4541 alu.dst.chan = i;
4542 alu.dst.write = 1;
4543 alu.last = i == last_inst;
4544
4545 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4546
4547 r = r600_bytecode_add_alu(ctx->bc, &alu);
4548 if (r)
4549 return r;
4550 }
4551
4552 t2 = r600_get_temp(ctx);
4553
4554 for (i = 0; i < 4; i++) {
4555 if (!(write_mask & (1<<i)))
4556 continue;
4557
4558 /* t2 = 31 - t1 */
4559 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4560 alu.op = ALU_OP2_SUB_INT;
4561 alu.dst.sel = t2;
4562 alu.dst.chan = i;
4563 alu.dst.write = 1;
4564 alu.last = i == last_inst;
4565
4566 alu.src[0].sel = V_SQ_ALU_SRC_LITERAL;
4567 alu.src[0].value = 31;
4568 alu.src[1].sel = t1;
4569 alu.src[1].chan = i;
4570
4571 r = r600_bytecode_add_alu(ctx->bc, &alu);
4572 if (r)
4573 return r;
4574 }
4575
4576 for (i = 0; i < 4; i++) {
4577 if (!(write_mask & (1<<i)))
4578 continue;
4579
4580 /* result = t1 >= 0 ? t2 : t1 */
4581 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4582 alu.op = ALU_OP3_CNDGE_INT;
4583 alu.is_op3 = 1;
4584 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4585 alu.dst.chan = i;
4586 alu.dst.write = 1;
4587 alu.last = i == last_inst;
4588
4589 alu.src[0].sel = t1;
4590 alu.src[0].chan = i;
4591 alu.src[1].sel = t2;
4592 alu.src[1].chan = i;
4593 alu.src[2].sel = t1;
4594 alu.src[2].chan = i;
4595
4596 r = r600_bytecode_add_alu(ctx->bc, &alu);
4597 if (r)
4598 return r;
4599 }
4600
4601 return 0;
4602 }
4603
4604 static int tgsi_interp_egcm(struct r600_shader_ctx *ctx)
4605 {
4606 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4607 struct r600_bytecode_alu alu;
4608 int r, i = 0, k, interp_gpr, interp_base_chan, tmp, lasti;
4609 unsigned location;
4610 int input;
4611
4612 assert(inst->Src[0].Register.File == TGSI_FILE_INPUT);
4613
4614 input = inst->Src[0].Register.Index;
4615
4616 /* Interpolators have been marked for use already by allocate_system_value_inputs */
4617 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
4618 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4619 location = TGSI_INTERPOLATE_LOC_CENTER; /* sample offset will be added explicitly */
4620 }
4621 else {
4622 location = TGSI_INTERPOLATE_LOC_CENTROID;
4623 }
4624
4625 k = eg_get_interpolator_index(ctx->shader->input[input].interpolate, location);
4626 if (k < 0)
4627 k = 0;
4628 interp_gpr = ctx->eg_interpolators[k].ij_index / 2;
4629 interp_base_chan = 2 * (ctx->eg_interpolators[k].ij_index % 2);
4630
4631 /* NOTE: currently offset is not perspective correct */
4632 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
4633 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4634 int sample_gpr = -1;
4635 int gradientsH, gradientsV;
4636 struct r600_bytecode_tex tex;
4637
4638 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4639 sample_gpr = load_sample_position(ctx, &ctx->src[1], ctx->src[1].swizzle[0]);
4640 }
4641
4642 gradientsH = r600_get_temp(ctx);
4643 gradientsV = r600_get_temp(ctx);
4644 for (i = 0; i < 2; i++) {
4645 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
4646 tex.op = i == 0 ? FETCH_OP_GET_GRADIENTS_H : FETCH_OP_GET_GRADIENTS_V;
4647 tex.src_gpr = interp_gpr;
4648 tex.src_sel_x = interp_base_chan + 0;
4649 tex.src_sel_y = interp_base_chan + 1;
4650 tex.src_sel_z = 0;
4651 tex.src_sel_w = 0;
4652 tex.dst_gpr = i == 0 ? gradientsH : gradientsV;
4653 tex.dst_sel_x = 0;
4654 tex.dst_sel_y = 1;
4655 tex.dst_sel_z = 7;
4656 tex.dst_sel_w = 7;
4657 tex.inst_mod = 1; // Use per pixel gradient calculation
4658 tex.sampler_id = 0;
4659 tex.resource_id = tex.sampler_id;
4660 r = r600_bytecode_add_tex(ctx->bc, &tex);
4661 if (r)
4662 return r;
4663 }
4664
4665 for (i = 0; i < 2; i++) {
4666 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4667 alu.op = ALU_OP3_MULADD;
4668 alu.is_op3 = 1;
4669 alu.src[0].sel = gradientsH;
4670 alu.src[0].chan = i;
4671 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4672 alu.src[1].sel = sample_gpr;
4673 alu.src[1].chan = 2;
4674 }
4675 else {
4676 r600_bytecode_src(&alu.src[1], &ctx->src[1], 0);
4677 }
4678 alu.src[2].sel = interp_gpr;
4679 alu.src[2].chan = interp_base_chan + i;
4680 alu.dst.sel = ctx->temp_reg;
4681 alu.dst.chan = i;
4682 alu.last = i == 1;
4683
4684 r = r600_bytecode_add_alu(ctx->bc, &alu);
4685 if (r)
4686 return r;
4687 }
4688
4689 for (i = 0; i < 2; i++) {
4690 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4691 alu.op = ALU_OP3_MULADD;
4692 alu.is_op3 = 1;
4693 alu.src[0].sel = gradientsV;
4694 alu.src[0].chan = i;
4695 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4696 alu.src[1].sel = sample_gpr;
4697 alu.src[1].chan = 3;
4698 }
4699 else {
4700 r600_bytecode_src(&alu.src[1], &ctx->src[1], 1);
4701 }
4702 alu.src[2].sel = ctx->temp_reg;
4703 alu.src[2].chan = i;
4704 alu.dst.sel = ctx->temp_reg;
4705 alu.dst.chan = i;
4706 alu.last = i == 1;
4707
4708 r = r600_bytecode_add_alu(ctx->bc, &alu);
4709 if (r)
4710 return r;
4711 }
4712 }
4713
4714 tmp = r600_get_temp(ctx);
4715 for (i = 0; i < 8; i++) {
4716 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4717 alu.op = i < 4 ? ALU_OP2_INTERP_ZW : ALU_OP2_INTERP_XY;
4718
4719 alu.dst.sel = tmp;
4720 if ((i > 1 && i < 6)) {
4721 alu.dst.write = 1;
4722 }
4723 else {
4724 alu.dst.write = 0;
4725 }
4726 alu.dst.chan = i % 4;
4727
4728 if (inst->Instruction.Opcode == TGSI_OPCODE_INTERP_OFFSET ||
4729 inst->Instruction.Opcode == TGSI_OPCODE_INTERP_SAMPLE) {
4730 alu.src[0].sel = ctx->temp_reg;
4731 alu.src[0].chan = 1 - (i % 2);
4732 } else {
4733 alu.src[0].sel = interp_gpr;
4734 alu.src[0].chan = interp_base_chan + 1 - (i % 2);
4735 }
4736 alu.src[1].sel = V_SQ_ALU_SRC_PARAM_BASE + ctx->shader->input[input].lds_pos;
4737 alu.src[1].chan = 0;
4738
4739 alu.last = i % 4 == 3;
4740 alu.bank_swizzle_force = SQ_ALU_VEC_210;
4741
4742 r = r600_bytecode_add_alu(ctx->bc, &alu);
4743 if (r)
4744 return r;
4745 }
4746
4747 // INTERP can't swizzle dst
4748 lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4749 for (i = 0; i <= lasti; i++) {
4750 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4751 continue;
4752
4753 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4754 alu.op = ALU_OP1_MOV;
4755 alu.src[0].sel = tmp;
4756 alu.src[0].chan = ctx->src[0].swizzle[i];
4757 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4758 alu.dst.write = 1;
4759 alu.last = i == lasti;
4760 r = r600_bytecode_add_alu(ctx->bc, &alu);
4761 if (r)
4762 return r;
4763 }
4764
4765 return 0;
4766 }
4767
4768
4769 static int tgsi_helper_copy(struct r600_shader_ctx *ctx, struct tgsi_full_instruction *inst)
4770 {
4771 struct r600_bytecode_alu alu;
4772 int i, r;
4773
4774 for (i = 0; i < 4; i++) {
4775 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4776 if (!(inst->Dst[0].Register.WriteMask & (1 << i))) {
4777 alu.op = ALU_OP0_NOP;
4778 alu.dst.chan = i;
4779 } else {
4780 alu.op = ALU_OP1_MOV;
4781 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4782 alu.src[0].sel = ctx->temp_reg;
4783 alu.src[0].chan = i;
4784 }
4785 if (i == 3) {
4786 alu.last = 1;
4787 }
4788 r = r600_bytecode_add_alu(ctx->bc, &alu);
4789 if (r)
4790 return r;
4791 }
4792 return 0;
4793 }
4794
4795 static int tgsi_op3(struct r600_shader_ctx *ctx)
4796 {
4797 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4798 struct r600_bytecode_alu alu;
4799 int i, j, r;
4800 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4801
4802 for (i = 0; i < lasti + 1; i++) {
4803 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4804 continue;
4805
4806 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4807 alu.op = ctx->inst_info->op;
4808 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4809 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
4810 }
4811
4812 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4813 alu.dst.chan = i;
4814 alu.dst.write = 1;
4815 alu.is_op3 = 1;
4816 if (i == lasti) {
4817 alu.last = 1;
4818 }
4819 r = r600_bytecode_add_alu(ctx->bc, &alu);
4820 if (r)
4821 return r;
4822 }
4823 return 0;
4824 }
4825
4826 static int tgsi_dp(struct r600_shader_ctx *ctx)
4827 {
4828 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4829 struct r600_bytecode_alu alu;
4830 int i, j, r;
4831
4832 for (i = 0; i < 4; i++) {
4833 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4834 alu.op = ctx->inst_info->op;
4835 for (j = 0; j < inst->Instruction.NumSrcRegs; j++) {
4836 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
4837 }
4838
4839 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
4840 alu.dst.chan = i;
4841 alu.dst.write = (inst->Dst[0].Register.WriteMask >> i) & 1;
4842 /* handle some special cases */
4843 switch (ctx->inst_info->tgsi_opcode) {
4844 case TGSI_OPCODE_DP2:
4845 if (i > 1) {
4846 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
4847 alu.src[0].chan = alu.src[1].chan = 0;
4848 }
4849 break;
4850 case TGSI_OPCODE_DP3:
4851 if (i > 2) {
4852 alu.src[0].sel = alu.src[1].sel = V_SQ_ALU_SRC_0;
4853 alu.src[0].chan = alu.src[1].chan = 0;
4854 }
4855 break;
4856 case TGSI_OPCODE_DPH:
4857 if (i == 3) {
4858 alu.src[0].sel = V_SQ_ALU_SRC_1;
4859 alu.src[0].chan = 0;
4860 alu.src[0].neg = 0;
4861 }
4862 break;
4863 default:
4864 break;
4865 }
4866 if (i == 3) {
4867 alu.last = 1;
4868 }
4869 r = r600_bytecode_add_alu(ctx->bc, &alu);
4870 if (r)
4871 return r;
4872 }
4873 return 0;
4874 }
4875
4876 static inline boolean tgsi_tex_src_requires_loading(struct r600_shader_ctx *ctx,
4877 unsigned index)
4878 {
4879 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4880 return (inst->Src[index].Register.File != TGSI_FILE_TEMPORARY &&
4881 inst->Src[index].Register.File != TGSI_FILE_INPUT &&
4882 inst->Src[index].Register.File != TGSI_FILE_OUTPUT) ||
4883 ctx->src[index].neg || ctx->src[index].abs;
4884 }
4885
4886 static inline unsigned tgsi_tex_get_src_gpr(struct r600_shader_ctx *ctx,
4887 unsigned index)
4888 {
4889 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4890 return ctx->file_offset[inst->Src[index].Register.File] + inst->Src[index].Register.Index;
4891 }
4892
4893 static int do_vtx_fetch_inst(struct r600_shader_ctx *ctx, boolean src_requires_loading)
4894 {
4895 struct r600_bytecode_vtx vtx;
4896 struct r600_bytecode_alu alu;
4897 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4898 int src_gpr, r, i;
4899 int id = tgsi_tex_get_src_gpr(ctx, 1);
4900
4901 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
4902 if (src_requires_loading) {
4903 for (i = 0; i < 4; i++) {
4904 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4905 alu.op = ALU_OP1_MOV;
4906 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
4907 alu.dst.sel = ctx->temp_reg;
4908 alu.dst.chan = i;
4909 if (i == 3)
4910 alu.last = 1;
4911 alu.dst.write = 1;
4912 r = r600_bytecode_add_alu(ctx->bc, &alu);
4913 if (r)
4914 return r;
4915 }
4916 src_gpr = ctx->temp_reg;
4917 }
4918
4919 memset(&vtx, 0, sizeof(vtx));
4920 vtx.op = FETCH_OP_VFETCH;
4921 vtx.buffer_id = id + R600_MAX_CONST_BUFFERS;
4922 vtx.fetch_type = 2; /* VTX_FETCH_NO_INDEX_OFFSET */
4923 vtx.src_gpr = src_gpr;
4924 vtx.mega_fetch_count = 16;
4925 vtx.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
4926 vtx.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7; /* SEL_X */
4927 vtx.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7; /* SEL_Y */
4928 vtx.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7; /* SEL_Z */
4929 vtx.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7; /* SEL_W */
4930 vtx.use_const_fields = 1;
4931
4932 if ((r = r600_bytecode_add_vtx(ctx->bc, &vtx)))
4933 return r;
4934
4935 if (ctx->bc->chip_class >= EVERGREEN)
4936 return 0;
4937
4938 for (i = 0; i < 4; i++) {
4939 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
4940 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
4941 continue;
4942
4943 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4944 alu.op = ALU_OP2_AND_INT;
4945
4946 alu.dst.chan = i;
4947 alu.dst.sel = vtx.dst_gpr;
4948 alu.dst.write = 1;
4949
4950 alu.src[0].sel = vtx.dst_gpr;
4951 alu.src[0].chan = i;
4952
4953 alu.src[1].sel = 512 + (id * 2);
4954 alu.src[1].chan = i % 4;
4955 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
4956
4957 if (i == lasti)
4958 alu.last = 1;
4959 r = r600_bytecode_add_alu(ctx->bc, &alu);
4960 if (r)
4961 return r;
4962 }
4963
4964 if (inst->Dst[0].Register.WriteMask & 3) {
4965 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4966 alu.op = ALU_OP2_OR_INT;
4967
4968 alu.dst.chan = 3;
4969 alu.dst.sel = vtx.dst_gpr;
4970 alu.dst.write = 1;
4971
4972 alu.src[0].sel = vtx.dst_gpr;
4973 alu.src[0].chan = 3;
4974
4975 alu.src[1].sel = 512 + (id * 2) + 1;
4976 alu.src[1].chan = 0;
4977 alu.src[1].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
4978
4979 alu.last = 1;
4980 r = r600_bytecode_add_alu(ctx->bc, &alu);
4981 if (r)
4982 return r;
4983 }
4984 return 0;
4985 }
4986
4987 static int r600_do_buffer_txq(struct r600_shader_ctx *ctx)
4988 {
4989 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
4990 struct r600_bytecode_alu alu;
4991 int r;
4992 int id = tgsi_tex_get_src_gpr(ctx, 1);
4993
4994 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
4995 alu.op = ALU_OP1_MOV;
4996
4997 if (ctx->bc->chip_class >= EVERGREEN) {
4998 alu.src[0].sel = 512 + (id / 4);
4999 alu.src[0].chan = id % 4;
5000 } else {
5001 /* r600 we have them at channel 2 of the second dword */
5002 alu.src[0].sel = 512 + (id * 2) + 1;
5003 alu.src[0].chan = 1;
5004 }
5005 alu.src[0].kc_bank = R600_BUFFER_INFO_CONST_BUFFER;
5006 tgsi_dst(ctx, &inst->Dst[0], 0, &alu.dst);
5007 alu.last = 1;
5008 r = r600_bytecode_add_alu(ctx->bc, &alu);
5009 if (r)
5010 return r;
5011 return 0;
5012 }
5013
5014 static int tgsi_tex(struct r600_shader_ctx *ctx)
5015 {
5016 static float one_point_five = 1.5f;
5017 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5018 struct r600_bytecode_tex tex;
5019 struct r600_bytecode_alu alu;
5020 unsigned src_gpr;
5021 int r, i, j;
5022 int opcode;
5023 bool read_compressed_msaa = ctx->bc->has_compressed_msaa_texturing &&
5024 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5025 (inst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
5026 inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA);
5027
5028 bool txf_add_offsets = inst->Texture.NumOffsets &&
5029 inst->Instruction.Opcode == TGSI_OPCODE_TXF &&
5030 inst->Texture.Texture != TGSI_TEXTURE_BUFFER;
5031
5032 /* Texture fetch instructions can only use gprs as source.
5033 * Also they cannot negate the source or take the absolute value */
5034 const boolean src_requires_loading = (inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ &&
5035 tgsi_tex_src_requires_loading(ctx, 0)) ||
5036 read_compressed_msaa || txf_add_offsets;
5037
5038 boolean src_loaded = FALSE;
5039 unsigned sampler_src_reg = inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ ? 0 : 1;
5040 int8_t offset_x = 0, offset_y = 0, offset_z = 0;
5041 boolean has_txq_cube_array_z = false;
5042
5043 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ &&
5044 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5045 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)))
5046 if (inst->Dst[0].Register.WriteMask & 4) {
5047 ctx->shader->has_txq_cube_array_z_comp = true;
5048 has_txq_cube_array_z = true;
5049 }
5050
5051 if (inst->Instruction.Opcode == TGSI_OPCODE_TEX2 ||
5052 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5053 inst->Instruction.Opcode == TGSI_OPCODE_TXL2 ||
5054 inst->Instruction.Opcode == TGSI_OPCODE_TG4)
5055 sampler_src_reg = 2;
5056
5057 src_gpr = tgsi_tex_get_src_gpr(ctx, 0);
5058
5059 if (inst->Texture.Texture == TGSI_TEXTURE_BUFFER) {
5060 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ) {
5061 ctx->shader->uses_tex_buffers = true;
5062 return r600_do_buffer_txq(ctx);
5063 }
5064 else if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
5065 if (ctx->bc->chip_class < EVERGREEN)
5066 ctx->shader->uses_tex_buffers = true;
5067 return do_vtx_fetch_inst(ctx, src_requires_loading);
5068 }
5069 }
5070
5071 if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
5072 /* TGSI moves the sampler to src reg 3 for TXD */
5073 sampler_src_reg = 3;
5074
5075 for (i = 1; i < 3; i++) {
5076 /* set gradients h/v */
5077 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5078 tex.op = (i == 1) ? FETCH_OP_SET_GRADIENTS_H :
5079 FETCH_OP_SET_GRADIENTS_V;
5080 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5081 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5082
5083 if (tgsi_tex_src_requires_loading(ctx, i)) {
5084 tex.src_gpr = r600_get_temp(ctx);
5085 tex.src_sel_x = 0;
5086 tex.src_sel_y = 1;
5087 tex.src_sel_z = 2;
5088 tex.src_sel_w = 3;
5089
5090 for (j = 0; j < 4; j++) {
5091 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5092 alu.op = ALU_OP1_MOV;
5093 r600_bytecode_src(&alu.src[0], &ctx->src[i], j);
5094 alu.dst.sel = tex.src_gpr;
5095 alu.dst.chan = j;
5096 if (j == 3)
5097 alu.last = 1;
5098 alu.dst.write = 1;
5099 r = r600_bytecode_add_alu(ctx->bc, &alu);
5100 if (r)
5101 return r;
5102 }
5103
5104 } else {
5105 tex.src_gpr = tgsi_tex_get_src_gpr(ctx, i);
5106 tex.src_sel_x = ctx->src[i].swizzle[0];
5107 tex.src_sel_y = ctx->src[i].swizzle[1];
5108 tex.src_sel_z = ctx->src[i].swizzle[2];
5109 tex.src_sel_w = ctx->src[i].swizzle[3];
5110 tex.src_rel = ctx->src[i].rel;
5111 }
5112 tex.dst_gpr = ctx->temp_reg; /* just to avoid confusing the asm scheduler */
5113 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
5114 if (inst->Texture.Texture != TGSI_TEXTURE_RECT) {
5115 tex.coord_type_x = 1;
5116 tex.coord_type_y = 1;
5117 tex.coord_type_z = 1;
5118 tex.coord_type_w = 1;
5119 }
5120 r = r600_bytecode_add_tex(ctx->bc, &tex);
5121 if (r)
5122 return r;
5123 }
5124 } else if (inst->Instruction.Opcode == TGSI_OPCODE_TXP) {
5125 int out_chan;
5126 /* Add perspective divide */
5127 if (ctx->bc->chip_class == CAYMAN) {
5128 out_chan = 2;
5129 for (i = 0; i < 3; i++) {
5130 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5131 alu.op = ALU_OP1_RECIP_IEEE;
5132 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5133
5134 alu.dst.sel = ctx->temp_reg;
5135 alu.dst.chan = i;
5136 if (i == 2)
5137 alu.last = 1;
5138 if (out_chan == i)
5139 alu.dst.write = 1;
5140 r = r600_bytecode_add_alu(ctx->bc, &alu);
5141 if (r)
5142 return r;
5143 }
5144
5145 } else {
5146 out_chan = 3;
5147 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5148 alu.op = ALU_OP1_RECIP_IEEE;
5149 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5150
5151 alu.dst.sel = ctx->temp_reg;
5152 alu.dst.chan = out_chan;
5153 alu.last = 1;
5154 alu.dst.write = 1;
5155 r = r600_bytecode_add_alu(ctx->bc, &alu);
5156 if (r)
5157 return r;
5158 }
5159
5160 for (i = 0; i < 3; i++) {
5161 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5162 alu.op = ALU_OP2_MUL;
5163 alu.src[0].sel = ctx->temp_reg;
5164 alu.src[0].chan = out_chan;
5165 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5166 alu.dst.sel = ctx->temp_reg;
5167 alu.dst.chan = i;
5168 alu.dst.write = 1;
5169 r = r600_bytecode_add_alu(ctx->bc, &alu);
5170 if (r)
5171 return r;
5172 }
5173 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5174 alu.op = ALU_OP1_MOV;
5175 alu.src[0].sel = V_SQ_ALU_SRC_1;
5176 alu.src[0].chan = 0;
5177 alu.dst.sel = ctx->temp_reg;
5178 alu.dst.chan = 3;
5179 alu.last = 1;
5180 alu.dst.write = 1;
5181 r = r600_bytecode_add_alu(ctx->bc, &alu);
5182 if (r)
5183 return r;
5184 src_loaded = TRUE;
5185 src_gpr = ctx->temp_reg;
5186 }
5187
5188 if ((inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
5189 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5190 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5191 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
5192 inst->Instruction.Opcode != TGSI_OPCODE_TXQ &&
5193 inst->Instruction.Opcode != TGSI_OPCODE_TXQ_LZ) {
5194
5195 static const unsigned src0_swizzle[] = {2, 2, 0, 1};
5196 static const unsigned src1_swizzle[] = {1, 0, 2, 2};
5197
5198 /* tmp1.xyzw = CUBE(R0.zzxy, R0.yxzz) */
5199 for (i = 0; i < 4; i++) {
5200 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5201 alu.op = ALU_OP2_CUBE;
5202 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
5203 r600_bytecode_src(&alu.src[1], &ctx->src[0], src1_swizzle[i]);
5204 alu.dst.sel = ctx->temp_reg;
5205 alu.dst.chan = i;
5206 if (i == 3)
5207 alu.last = 1;
5208 alu.dst.write = 1;
5209 r = r600_bytecode_add_alu(ctx->bc, &alu);
5210 if (r)
5211 return r;
5212 }
5213
5214 /* tmp1.z = RCP_e(|tmp1.z|) */
5215 if (ctx->bc->chip_class == CAYMAN) {
5216 for (i = 0; i < 3; i++) {
5217 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5218 alu.op = ALU_OP1_RECIP_IEEE;
5219 alu.src[0].sel = ctx->temp_reg;
5220 alu.src[0].chan = 2;
5221 alu.src[0].abs = 1;
5222 alu.dst.sel = ctx->temp_reg;
5223 alu.dst.chan = i;
5224 if (i == 2)
5225 alu.dst.write = 1;
5226 if (i == 2)
5227 alu.last = 1;
5228 r = r600_bytecode_add_alu(ctx->bc, &alu);
5229 if (r)
5230 return r;
5231 }
5232 } else {
5233 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5234 alu.op = ALU_OP1_RECIP_IEEE;
5235 alu.src[0].sel = ctx->temp_reg;
5236 alu.src[0].chan = 2;
5237 alu.src[0].abs = 1;
5238 alu.dst.sel = ctx->temp_reg;
5239 alu.dst.chan = 2;
5240 alu.dst.write = 1;
5241 alu.last = 1;
5242 r = r600_bytecode_add_alu(ctx->bc, &alu);
5243 if (r)
5244 return r;
5245 }
5246
5247 /* MULADD R0.x, R0.x, PS1, (0x3FC00000, 1.5f).x
5248 * MULADD R0.y, R0.y, PS1, (0x3FC00000, 1.5f).x
5249 * muladd has no writemask, have to use another temp
5250 */
5251 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5252 alu.op = ALU_OP3_MULADD;
5253 alu.is_op3 = 1;
5254
5255 alu.src[0].sel = ctx->temp_reg;
5256 alu.src[0].chan = 0;
5257 alu.src[1].sel = ctx->temp_reg;
5258 alu.src[1].chan = 2;
5259
5260 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5261 alu.src[2].chan = 0;
5262 alu.src[2].value = *(uint32_t *)&one_point_five;
5263
5264 alu.dst.sel = ctx->temp_reg;
5265 alu.dst.chan = 0;
5266 alu.dst.write = 1;
5267
5268 r = r600_bytecode_add_alu(ctx->bc, &alu);
5269 if (r)
5270 return r;
5271
5272 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5273 alu.op = ALU_OP3_MULADD;
5274 alu.is_op3 = 1;
5275
5276 alu.src[0].sel = ctx->temp_reg;
5277 alu.src[0].chan = 1;
5278 alu.src[1].sel = ctx->temp_reg;
5279 alu.src[1].chan = 2;
5280
5281 alu.src[2].sel = V_SQ_ALU_SRC_LITERAL;
5282 alu.src[2].chan = 0;
5283 alu.src[2].value = *(uint32_t *)&one_point_five;
5284
5285 alu.dst.sel = ctx->temp_reg;
5286 alu.dst.chan = 1;
5287 alu.dst.write = 1;
5288
5289 alu.last = 1;
5290 r = r600_bytecode_add_alu(ctx->bc, &alu);
5291 if (r)
5292 return r;
5293 /* write initial compare value into Z component
5294 - W src 0 for shadow cube
5295 - X src 1 for shadow cube array */
5296 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5297 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5298 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5299 alu.op = ALU_OP1_MOV;
5300 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY)
5301 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5302 else
5303 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5304 alu.dst.sel = ctx->temp_reg;
5305 alu.dst.chan = 2;
5306 alu.dst.write = 1;
5307 alu.last = 1;
5308 r = r600_bytecode_add_alu(ctx->bc, &alu);
5309 if (r)
5310 return r;
5311 }
5312
5313 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5314 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5315 if (ctx->bc->chip_class >= EVERGREEN) {
5316 int mytmp = r600_get_temp(ctx);
5317 static const float eight = 8.0f;
5318 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5319 alu.op = ALU_OP1_MOV;
5320 alu.src[0].sel = ctx->temp_reg;
5321 alu.src[0].chan = 3;
5322 alu.dst.sel = mytmp;
5323 alu.dst.chan = 0;
5324 alu.dst.write = 1;
5325 alu.last = 1;
5326 r = r600_bytecode_add_alu(ctx->bc, &alu);
5327 if (r)
5328 return r;
5329
5330 /* have to multiply original layer by 8 and add to face id (temp.w) in Z */
5331 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5332 alu.op = ALU_OP3_MULADD;
5333 alu.is_op3 = 1;
5334 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5335 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5336 alu.src[1].chan = 0;
5337 alu.src[1].value = *(uint32_t *)&eight;
5338 alu.src[2].sel = mytmp;
5339 alu.src[2].chan = 0;
5340 alu.dst.sel = ctx->temp_reg;
5341 alu.dst.chan = 3;
5342 alu.dst.write = 1;
5343 alu.last = 1;
5344 r = r600_bytecode_add_alu(ctx->bc, &alu);
5345 if (r)
5346 return r;
5347 } else if (ctx->bc->chip_class < EVERGREEN) {
5348 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5349 tex.op = FETCH_OP_SET_CUBEMAP_INDEX;
5350 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5351 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5352 tex.src_gpr = r600_get_temp(ctx);
5353 tex.src_sel_x = 0;
5354 tex.src_sel_y = 0;
5355 tex.src_sel_z = 0;
5356 tex.src_sel_w = 0;
5357 tex.dst_sel_x = tex.dst_sel_y = tex.dst_sel_z = tex.dst_sel_w = 7;
5358 tex.coord_type_x = 1;
5359 tex.coord_type_y = 1;
5360 tex.coord_type_z = 1;
5361 tex.coord_type_w = 1;
5362 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5363 alu.op = ALU_OP1_MOV;
5364 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5365 alu.dst.sel = tex.src_gpr;
5366 alu.dst.chan = 0;
5367 alu.last = 1;
5368 alu.dst.write = 1;
5369 r = r600_bytecode_add_alu(ctx->bc, &alu);
5370 if (r)
5371 return r;
5372
5373 r = r600_bytecode_add_tex(ctx->bc, &tex);
5374 if (r)
5375 return r;
5376 }
5377
5378 }
5379
5380 /* for cube forms of lod and bias we need to route things */
5381 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB ||
5382 inst->Instruction.Opcode == TGSI_OPCODE_TXL ||
5383 inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5384 inst->Instruction.Opcode == TGSI_OPCODE_TXL2) {
5385 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5386 alu.op = ALU_OP1_MOV;
5387 if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2 ||
5388 inst->Instruction.Opcode == TGSI_OPCODE_TXL2)
5389 r600_bytecode_src(&alu.src[0], &ctx->src[1], 0);
5390 else
5391 r600_bytecode_src(&alu.src[0], &ctx->src[0], 3);
5392 alu.dst.sel = ctx->temp_reg;
5393 alu.dst.chan = 2;
5394 alu.last = 1;
5395 alu.dst.write = 1;
5396 r = r600_bytecode_add_alu(ctx->bc, &alu);
5397 if (r)
5398 return r;
5399 }
5400
5401 src_loaded = TRUE;
5402 src_gpr = ctx->temp_reg;
5403 }
5404
5405 if (src_requires_loading && !src_loaded) {
5406 for (i = 0; i < 4; i++) {
5407 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5408 alu.op = ALU_OP1_MOV;
5409 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5410 alu.dst.sel = ctx->temp_reg;
5411 alu.dst.chan = i;
5412 if (i == 3)
5413 alu.last = 1;
5414 alu.dst.write = 1;
5415 r = r600_bytecode_add_alu(ctx->bc, &alu);
5416 if (r)
5417 return r;
5418 }
5419 src_loaded = TRUE;
5420 src_gpr = ctx->temp_reg;
5421 }
5422
5423 /* get offset values */
5424 if (inst->Texture.NumOffsets) {
5425 assert(inst->Texture.NumOffsets == 1);
5426
5427 /* The texture offset feature doesn't work with the TXF instruction
5428 * and must be emulated by adding the offset to the texture coordinates. */
5429 if (txf_add_offsets) {
5430 const struct tgsi_texture_offset *off = inst->TexOffsets;
5431
5432 switch (inst->Texture.Texture) {
5433 case TGSI_TEXTURE_3D:
5434 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5435 alu.op = ALU_OP2_ADD_INT;
5436 alu.src[0].sel = src_gpr;
5437 alu.src[0].chan = 2;
5438 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5439 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleZ];
5440 alu.dst.sel = src_gpr;
5441 alu.dst.chan = 2;
5442 alu.dst.write = 1;
5443 alu.last = 1;
5444 r = r600_bytecode_add_alu(ctx->bc, &alu);
5445 if (r)
5446 return r;
5447 /* fall through */
5448
5449 case TGSI_TEXTURE_2D:
5450 case TGSI_TEXTURE_SHADOW2D:
5451 case TGSI_TEXTURE_RECT:
5452 case TGSI_TEXTURE_SHADOWRECT:
5453 case TGSI_TEXTURE_2D_ARRAY:
5454 case TGSI_TEXTURE_SHADOW2D_ARRAY:
5455 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5456 alu.op = ALU_OP2_ADD_INT;
5457 alu.src[0].sel = src_gpr;
5458 alu.src[0].chan = 1;
5459 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5460 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleY];
5461 alu.dst.sel = src_gpr;
5462 alu.dst.chan = 1;
5463 alu.dst.write = 1;
5464 alu.last = 1;
5465 r = r600_bytecode_add_alu(ctx->bc, &alu);
5466 if (r)
5467 return r;
5468 /* fall through */
5469
5470 case TGSI_TEXTURE_1D:
5471 case TGSI_TEXTURE_SHADOW1D:
5472 case TGSI_TEXTURE_1D_ARRAY:
5473 case TGSI_TEXTURE_SHADOW1D_ARRAY:
5474 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5475 alu.op = ALU_OP2_ADD_INT;
5476 alu.src[0].sel = src_gpr;
5477 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5478 alu.src[1].value = ctx->literals[4 * off[0].Index + off[0].SwizzleX];
5479 alu.dst.sel = src_gpr;
5480 alu.dst.write = 1;
5481 alu.last = 1;
5482 r = r600_bytecode_add_alu(ctx->bc, &alu);
5483 if (r)
5484 return r;
5485 break;
5486 /* texture offsets do not apply to other texture targets */
5487 }
5488 } else {
5489 offset_x = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleX] << 1;
5490 offset_y = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleY] << 1;
5491 offset_z = ctx->literals[4 * inst->TexOffsets[0].Index + inst->TexOffsets[0].SwizzleZ] << 1;
5492 }
5493 }
5494
5495 /* Obtain the sample index for reading a compressed MSAA color texture.
5496 * To read the FMASK, we use the ldfptr instruction, which tells us
5497 * where the samples are stored.
5498 * For uncompressed 8x MSAA surfaces, ldfptr should return 0x76543210,
5499 * which is the identity mapping. Each nibble says which physical sample
5500 * should be fetched to get that sample.
5501 *
5502 * Assume src.z contains the sample index. It should be modified like this:
5503 * src.z = (ldfptr() >> (src.z * 4)) & 0xF;
5504 * Then fetch the texel with src.
5505 */
5506 if (read_compressed_msaa) {
5507 unsigned sample_chan = 3;
5508 unsigned temp = r600_get_temp(ctx);
5509 assert(src_loaded);
5510
5511 /* temp.w = ldfptr() */
5512 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5513 tex.op = FETCH_OP_LD;
5514 tex.inst_mod = 1; /* to indicate this is ldfptr */
5515 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5516 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5517 tex.src_gpr = src_gpr;
5518 tex.dst_gpr = temp;
5519 tex.dst_sel_x = 7; /* mask out these components */
5520 tex.dst_sel_y = 7;
5521 tex.dst_sel_z = 7;
5522 tex.dst_sel_w = 0; /* store X */
5523 tex.src_sel_x = 0;
5524 tex.src_sel_y = 1;
5525 tex.src_sel_z = 2;
5526 tex.src_sel_w = 3;
5527 tex.offset_x = offset_x;
5528 tex.offset_y = offset_y;
5529 tex.offset_z = offset_z;
5530 r = r600_bytecode_add_tex(ctx->bc, &tex);
5531 if (r)
5532 return r;
5533
5534 /* temp.x = sample_index*4 */
5535 if (ctx->bc->chip_class == CAYMAN) {
5536 for (i = 0 ; i < 4; i++) {
5537 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5538 alu.op = ALU_OP2_MULLO_INT;
5539 alu.src[0].sel = src_gpr;
5540 alu.src[0].chan = sample_chan;
5541 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5542 alu.src[1].value = 4;
5543 alu.dst.sel = temp;
5544 alu.dst.chan = i;
5545 alu.dst.write = i == 0;
5546 if (i == 3)
5547 alu.last = 1;
5548 r = r600_bytecode_add_alu(ctx->bc, &alu);
5549 if (r)
5550 return r;
5551 }
5552 } else {
5553 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5554 alu.op = ALU_OP2_MULLO_INT;
5555 alu.src[0].sel = src_gpr;
5556 alu.src[0].chan = sample_chan;
5557 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5558 alu.src[1].value = 4;
5559 alu.dst.sel = temp;
5560 alu.dst.chan = 0;
5561 alu.dst.write = 1;
5562 alu.last = 1;
5563 r = r600_bytecode_add_alu(ctx->bc, &alu);
5564 if (r)
5565 return r;
5566 }
5567
5568 /* sample_index = temp.w >> temp.x */
5569 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5570 alu.op = ALU_OP2_LSHR_INT;
5571 alu.src[0].sel = temp;
5572 alu.src[0].chan = 3;
5573 alu.src[1].sel = temp;
5574 alu.src[1].chan = 0;
5575 alu.dst.sel = src_gpr;
5576 alu.dst.chan = sample_chan;
5577 alu.dst.write = 1;
5578 alu.last = 1;
5579 r = r600_bytecode_add_alu(ctx->bc, &alu);
5580 if (r)
5581 return r;
5582
5583 /* sample_index & 0xF */
5584 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5585 alu.op = ALU_OP2_AND_INT;
5586 alu.src[0].sel = src_gpr;
5587 alu.src[0].chan = sample_chan;
5588 alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
5589 alu.src[1].value = 0xF;
5590 alu.dst.sel = src_gpr;
5591 alu.dst.chan = sample_chan;
5592 alu.dst.write = 1;
5593 alu.last = 1;
5594 r = r600_bytecode_add_alu(ctx->bc, &alu);
5595 if (r)
5596 return r;
5597 #if 0
5598 /* visualize the FMASK */
5599 for (i = 0; i < 4; i++) {
5600 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5601 alu.op = ALU_OP1_INT_TO_FLT;
5602 alu.src[0].sel = src_gpr;
5603 alu.src[0].chan = sample_chan;
5604 alu.dst.sel = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
5605 alu.dst.chan = i;
5606 alu.dst.write = 1;
5607 alu.last = 1;
5608 r = r600_bytecode_add_alu(ctx->bc, &alu);
5609 if (r)
5610 return r;
5611 }
5612 return 0;
5613 #endif
5614 }
5615
5616 /* does this shader want a num layers from TXQ for a cube array? */
5617 if (has_txq_cube_array_z) {
5618 int id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5619
5620 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5621 alu.op = ALU_OP1_MOV;
5622
5623 alu.src[0].sel = 512 + (id / 4);
5624 alu.src[0].kc_bank = R600_TXQ_CONST_BUFFER;
5625 alu.src[0].chan = id % 4;
5626 tgsi_dst(ctx, &inst->Dst[0], 2, &alu.dst);
5627 alu.last = 1;
5628 r = r600_bytecode_add_alu(ctx->bc, &alu);
5629 if (r)
5630 return r;
5631 /* disable writemask from texture instruction */
5632 inst->Dst[0].Register.WriteMask &= ~4;
5633 }
5634
5635 opcode = ctx->inst_info->op;
5636 if (opcode == FETCH_OP_GATHER4 &&
5637 inst->TexOffsets[0].File != TGSI_FILE_NULL &&
5638 inst->TexOffsets[0].File != TGSI_FILE_IMMEDIATE) {
5639 opcode = FETCH_OP_GATHER4_O;
5640
5641 /* GATHER4_O/GATHER4_C_O use offset values loaded by
5642 SET_TEXTURE_OFFSETS instruction. The immediate offset values
5643 encoded in the instruction are ignored. */
5644 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5645 tex.op = FETCH_OP_SET_TEXTURE_OFFSETS;
5646 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5647 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5648
5649 tex.src_gpr = ctx->file_offset[inst->TexOffsets[0].File] + inst->TexOffsets[0].Index;
5650 tex.src_sel_x = inst->TexOffsets[0].SwizzleX;
5651 tex.src_sel_y = inst->TexOffsets[0].SwizzleY;
5652 tex.src_sel_z = inst->TexOffsets[0].SwizzleZ;
5653 tex.src_sel_w = 4;
5654
5655 tex.dst_sel_x = 7;
5656 tex.dst_sel_y = 7;
5657 tex.dst_sel_z = 7;
5658 tex.dst_sel_w = 7;
5659
5660 r = r600_bytecode_add_tex(ctx->bc, &tex);
5661 if (r)
5662 return r;
5663 }
5664
5665 if (inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
5666 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
5667 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
5668 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5669 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY ||
5670 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
5671 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5672 switch (opcode) {
5673 case FETCH_OP_SAMPLE:
5674 opcode = FETCH_OP_SAMPLE_C;
5675 break;
5676 case FETCH_OP_SAMPLE_L:
5677 opcode = FETCH_OP_SAMPLE_C_L;
5678 break;
5679 case FETCH_OP_SAMPLE_LB:
5680 opcode = FETCH_OP_SAMPLE_C_LB;
5681 break;
5682 case FETCH_OP_SAMPLE_G:
5683 opcode = FETCH_OP_SAMPLE_C_G;
5684 break;
5685 /* Texture gather variants */
5686 case FETCH_OP_GATHER4:
5687 opcode = FETCH_OP_GATHER4_C;
5688 break;
5689 case FETCH_OP_GATHER4_O:
5690 opcode = FETCH_OP_GATHER4_C_O;
5691 break;
5692 }
5693 }
5694
5695 memset(&tex, 0, sizeof(struct r600_bytecode_tex));
5696 tex.op = opcode;
5697
5698 tex.sampler_id = tgsi_tex_get_src_gpr(ctx, sampler_src_reg);
5699 tex.resource_id = tex.sampler_id + R600_MAX_CONST_BUFFERS;
5700 tex.src_gpr = src_gpr;
5701 tex.dst_gpr = ctx->file_offset[inst->Dst[0].Register.File] + inst->Dst[0].Register.Index;
5702
5703 if (inst->Instruction.Opcode == TGSI_OPCODE_DDX_FINE ||
5704 inst->Instruction.Opcode == TGSI_OPCODE_DDY_FINE) {
5705 tex.inst_mod = 1; /* per pixel gradient calculation instead of per 2x2 quad */
5706 }
5707
5708 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4) {
5709 int8_t texture_component_select = ctx->literals[4 * inst->Src[1].Register.Index + inst->Src[1].Register.SwizzleX];
5710 tex.inst_mod = texture_component_select;
5711
5712 /* GATHER4 result order is different from TGSI TG4 */
5713 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
5714 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
5715 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
5716 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
5717 }
5718 else if (inst->Instruction.Opcode == TGSI_OPCODE_LODQ) {
5719 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
5720 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
5721 tex.dst_sel_z = 7;
5722 tex.dst_sel_w = 7;
5723 }
5724 else {
5725 tex.dst_sel_x = (inst->Dst[0].Register.WriteMask & 1) ? 0 : 7;
5726 tex.dst_sel_y = (inst->Dst[0].Register.WriteMask & 2) ? 1 : 7;
5727 tex.dst_sel_z = (inst->Dst[0].Register.WriteMask & 4) ? 2 : 7;
5728 tex.dst_sel_w = (inst->Dst[0].Register.WriteMask & 8) ? 3 : 7;
5729 }
5730
5731
5732 if (inst->Instruction.Opcode == TGSI_OPCODE_TXQ_LZ) {
5733 tex.src_sel_x = 4;
5734 tex.src_sel_y = 4;
5735 tex.src_sel_z = 4;
5736 tex.src_sel_w = 4;
5737 } else if (src_loaded) {
5738 tex.src_sel_x = 0;
5739 tex.src_sel_y = 1;
5740 tex.src_sel_z = 2;
5741 tex.src_sel_w = 3;
5742 } else {
5743 tex.src_sel_x = ctx->src[0].swizzle[0];
5744 tex.src_sel_y = ctx->src[0].swizzle[1];
5745 tex.src_sel_z = ctx->src[0].swizzle[2];
5746 tex.src_sel_w = ctx->src[0].swizzle[3];
5747 tex.src_rel = ctx->src[0].rel;
5748 }
5749
5750 if (inst->Texture.Texture == TGSI_TEXTURE_CUBE ||
5751 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE ||
5752 inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5753 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) {
5754 tex.src_sel_x = 1;
5755 tex.src_sel_y = 0;
5756 tex.src_sel_z = 3;
5757 tex.src_sel_w = 2; /* route Z compare or Lod value into W */
5758 }
5759
5760 if (inst->Texture.Texture != TGSI_TEXTURE_RECT &&
5761 inst->Texture.Texture != TGSI_TEXTURE_SHADOWRECT) {
5762 tex.coord_type_x = 1;
5763 tex.coord_type_y = 1;
5764 }
5765 tex.coord_type_z = 1;
5766 tex.coord_type_w = 1;
5767
5768 tex.offset_x = offset_x;
5769 tex.offset_y = offset_y;
5770 if (inst->Instruction.Opcode == TGSI_OPCODE_TG4 &&
5771 (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
5772 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY)) {
5773 tex.offset_z = 0;
5774 }
5775 else {
5776 tex.offset_z = offset_z;
5777 }
5778
5779 /* Put the depth for comparison in W.
5780 * TGSI_TEXTURE_SHADOW2D_ARRAY already has the depth in W.
5781 * Some instructions expect the depth in Z. */
5782 if ((inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D ||
5783 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D ||
5784 inst->Texture.Texture == TGSI_TEXTURE_SHADOWRECT ||
5785 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) &&
5786 opcode != FETCH_OP_SAMPLE_C_L &&
5787 opcode != FETCH_OP_SAMPLE_C_LB) {
5788 tex.src_sel_w = tex.src_sel_z;
5789 }
5790
5791 if (inst->Texture.Texture == TGSI_TEXTURE_1D_ARRAY ||
5792 inst->Texture.Texture == TGSI_TEXTURE_SHADOW1D_ARRAY) {
5793 if (opcode == FETCH_OP_SAMPLE_C_L ||
5794 opcode == FETCH_OP_SAMPLE_C_LB) {
5795 /* the array index is read from Y */
5796 tex.coord_type_y = 0;
5797 } else {
5798 /* the array index is read from Z */
5799 tex.coord_type_z = 0;
5800 tex.src_sel_z = tex.src_sel_y;
5801 }
5802 } else if (inst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY ||
5803 inst->Texture.Texture == TGSI_TEXTURE_SHADOW2D_ARRAY ||
5804 ((inst->Texture.Texture == TGSI_TEXTURE_CUBE_ARRAY ||
5805 inst->Texture.Texture == TGSI_TEXTURE_SHADOWCUBE_ARRAY) &&
5806 (ctx->bc->chip_class >= EVERGREEN)))
5807 /* the array index is read from Z */
5808 tex.coord_type_z = 0;
5809
5810 /* mask unused source components */
5811 if (opcode == FETCH_OP_SAMPLE || opcode == FETCH_OP_GATHER4) {
5812 switch (inst->Texture.Texture) {
5813 case TGSI_TEXTURE_2D:
5814 case TGSI_TEXTURE_RECT:
5815 tex.src_sel_z = 7;
5816 tex.src_sel_w = 7;
5817 break;
5818 case TGSI_TEXTURE_1D_ARRAY:
5819 tex.src_sel_y = 7;
5820 tex.src_sel_w = 7;
5821 break;
5822 case TGSI_TEXTURE_1D:
5823 tex.src_sel_y = 7;
5824 tex.src_sel_z = 7;
5825 tex.src_sel_w = 7;
5826 break;
5827 }
5828 }
5829
5830 r = r600_bytecode_add_tex(ctx->bc, &tex);
5831 if (r)
5832 return r;
5833
5834 /* add shadow ambient support - gallium doesn't do it yet */
5835 return 0;
5836 }
5837
5838 static int tgsi_lrp(struct r600_shader_ctx *ctx)
5839 {
5840 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5841 struct r600_bytecode_alu alu;
5842 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5843 unsigned i;
5844 int r;
5845
5846 /* optimize if it's just an equal balance */
5847 if (ctx->src[0].sel == V_SQ_ALU_SRC_0_5) {
5848 for (i = 0; i < lasti + 1; i++) {
5849 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5850 continue;
5851
5852 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5853 alu.op = ALU_OP2_ADD;
5854 r600_bytecode_src(&alu.src[0], &ctx->src[1], i);
5855 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5856 alu.omod = 3;
5857 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5858 alu.dst.chan = i;
5859 if (i == lasti) {
5860 alu.last = 1;
5861 }
5862 r = r600_bytecode_add_alu(ctx->bc, &alu);
5863 if (r)
5864 return r;
5865 }
5866 return 0;
5867 }
5868
5869 /* 1 - src0 */
5870 for (i = 0; i < lasti + 1; i++) {
5871 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5872 continue;
5873
5874 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5875 alu.op = ALU_OP2_ADD;
5876 alu.src[0].sel = V_SQ_ALU_SRC_1;
5877 alu.src[0].chan = 0;
5878 r600_bytecode_src(&alu.src[1], &ctx->src[0], i);
5879 r600_bytecode_src_toggle_neg(&alu.src[1]);
5880 alu.dst.sel = ctx->temp_reg;
5881 alu.dst.chan = i;
5882 if (i == lasti) {
5883 alu.last = 1;
5884 }
5885 alu.dst.write = 1;
5886 r = r600_bytecode_add_alu(ctx->bc, &alu);
5887 if (r)
5888 return r;
5889 }
5890
5891 /* (1 - src0) * src2 */
5892 for (i = 0; i < lasti + 1; i++) {
5893 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5894 continue;
5895
5896 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5897 alu.op = ALU_OP2_MUL;
5898 alu.src[0].sel = ctx->temp_reg;
5899 alu.src[0].chan = i;
5900 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5901 alu.dst.sel = ctx->temp_reg;
5902 alu.dst.chan = i;
5903 if (i == lasti) {
5904 alu.last = 1;
5905 }
5906 alu.dst.write = 1;
5907 r = r600_bytecode_add_alu(ctx->bc, &alu);
5908 if (r)
5909 return r;
5910 }
5911
5912 /* src0 * src1 + (1 - src0) * src2 */
5913 for (i = 0; i < lasti + 1; i++) {
5914 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5915 continue;
5916
5917 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5918 alu.op = ALU_OP3_MULADD;
5919 alu.is_op3 = 1;
5920 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5921 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
5922 alu.src[2].sel = ctx->temp_reg;
5923 alu.src[2].chan = i;
5924
5925 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5926 alu.dst.chan = i;
5927 if (i == lasti) {
5928 alu.last = 1;
5929 }
5930 r = r600_bytecode_add_alu(ctx->bc, &alu);
5931 if (r)
5932 return r;
5933 }
5934 return 0;
5935 }
5936
5937 static int tgsi_cmp(struct r600_shader_ctx *ctx)
5938 {
5939 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5940 struct r600_bytecode_alu alu;
5941 int i, r;
5942 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5943
5944 for (i = 0; i < lasti + 1; i++) {
5945 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5946 continue;
5947
5948 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5949 alu.op = ALU_OP3_CNDGE;
5950 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5951 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5952 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
5953 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5954 alu.dst.chan = i;
5955 alu.dst.write = 1;
5956 alu.is_op3 = 1;
5957 if (i == lasti)
5958 alu.last = 1;
5959 r = r600_bytecode_add_alu(ctx->bc, &alu);
5960 if (r)
5961 return r;
5962 }
5963 return 0;
5964 }
5965
5966 static int tgsi_ucmp(struct r600_shader_ctx *ctx)
5967 {
5968 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5969 struct r600_bytecode_alu alu;
5970 int i, r;
5971 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
5972
5973 for (i = 0; i < lasti + 1; i++) {
5974 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
5975 continue;
5976
5977 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
5978 alu.op = ALU_OP3_CNDGE_INT;
5979 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
5980 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
5981 r600_bytecode_src(&alu.src[2], &ctx->src[1], i);
5982 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
5983 alu.dst.chan = i;
5984 alu.dst.write = 1;
5985 alu.is_op3 = 1;
5986 if (i == lasti)
5987 alu.last = 1;
5988 r = r600_bytecode_add_alu(ctx->bc, &alu);
5989 if (r)
5990 return r;
5991 }
5992 return 0;
5993 }
5994
5995 static int tgsi_xpd(struct r600_shader_ctx *ctx)
5996 {
5997 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
5998 static const unsigned int src0_swizzle[] = {2, 0, 1};
5999 static const unsigned int src1_swizzle[] = {1, 2, 0};
6000 struct r600_bytecode_alu alu;
6001 uint32_t use_temp = 0;
6002 int i, r;
6003
6004 if (inst->Dst[0].Register.WriteMask != 0xf)
6005 use_temp = 1;
6006
6007 for (i = 0; i < 4; i++) {
6008 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6009 alu.op = ALU_OP2_MUL;
6010 if (i < 3) {
6011 r600_bytecode_src(&alu.src[0], &ctx->src[0], src0_swizzle[i]);
6012 r600_bytecode_src(&alu.src[1], &ctx->src[1], src1_swizzle[i]);
6013 } else {
6014 alu.src[0].sel = V_SQ_ALU_SRC_0;
6015 alu.src[0].chan = i;
6016 alu.src[1].sel = V_SQ_ALU_SRC_0;
6017 alu.src[1].chan = i;
6018 }
6019
6020 alu.dst.sel = ctx->temp_reg;
6021 alu.dst.chan = i;
6022 alu.dst.write = 1;
6023
6024 if (i == 3)
6025 alu.last = 1;
6026 r = r600_bytecode_add_alu(ctx->bc, &alu);
6027 if (r)
6028 return r;
6029 }
6030
6031 for (i = 0; i < 4; i++) {
6032 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6033 alu.op = ALU_OP3_MULADD;
6034
6035 if (i < 3) {
6036 r600_bytecode_src(&alu.src[0], &ctx->src[0], src1_swizzle[i]);
6037 r600_bytecode_src(&alu.src[1], &ctx->src[1], src0_swizzle[i]);
6038 } else {
6039 alu.src[0].sel = V_SQ_ALU_SRC_0;
6040 alu.src[0].chan = i;
6041 alu.src[1].sel = V_SQ_ALU_SRC_0;
6042 alu.src[1].chan = i;
6043 }
6044
6045 alu.src[2].sel = ctx->temp_reg;
6046 alu.src[2].neg = 1;
6047 alu.src[2].chan = i;
6048
6049 if (use_temp)
6050 alu.dst.sel = ctx->temp_reg;
6051 else
6052 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6053 alu.dst.chan = i;
6054 alu.dst.write = 1;
6055 alu.is_op3 = 1;
6056 if (i == 3)
6057 alu.last = 1;
6058 r = r600_bytecode_add_alu(ctx->bc, &alu);
6059 if (r)
6060 return r;
6061 }
6062 if (use_temp)
6063 return tgsi_helper_copy(ctx, inst);
6064 return 0;
6065 }
6066
6067 static int tgsi_exp(struct r600_shader_ctx *ctx)
6068 {
6069 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6070 struct r600_bytecode_alu alu;
6071 int r;
6072 int i;
6073
6074 /* result.x = 2^floor(src); */
6075 if (inst->Dst[0].Register.WriteMask & 1) {
6076 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6077
6078 alu.op = ALU_OP1_FLOOR;
6079 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6080
6081 alu.dst.sel = ctx->temp_reg;
6082 alu.dst.chan = 0;
6083 alu.dst.write = 1;
6084 alu.last = 1;
6085 r = r600_bytecode_add_alu(ctx->bc, &alu);
6086 if (r)
6087 return r;
6088
6089 if (ctx->bc->chip_class == CAYMAN) {
6090 for (i = 0; i < 3; i++) {
6091 alu.op = ALU_OP1_EXP_IEEE;
6092 alu.src[0].sel = ctx->temp_reg;
6093 alu.src[0].chan = 0;
6094
6095 alu.dst.sel = ctx->temp_reg;
6096 alu.dst.chan = i;
6097 alu.dst.write = i == 0;
6098 alu.last = i == 2;
6099 r = r600_bytecode_add_alu(ctx->bc, &alu);
6100 if (r)
6101 return r;
6102 }
6103 } else {
6104 alu.op = ALU_OP1_EXP_IEEE;
6105 alu.src[0].sel = ctx->temp_reg;
6106 alu.src[0].chan = 0;
6107
6108 alu.dst.sel = ctx->temp_reg;
6109 alu.dst.chan = 0;
6110 alu.dst.write = 1;
6111 alu.last = 1;
6112 r = r600_bytecode_add_alu(ctx->bc, &alu);
6113 if (r)
6114 return r;
6115 }
6116 }
6117
6118 /* result.y = tmp - floor(tmp); */
6119 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
6120 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6121
6122 alu.op = ALU_OP1_FRACT;
6123 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6124
6125 alu.dst.sel = ctx->temp_reg;
6126 #if 0
6127 r = tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6128 if (r)
6129 return r;
6130 #endif
6131 alu.dst.write = 1;
6132 alu.dst.chan = 1;
6133
6134 alu.last = 1;
6135
6136 r = r600_bytecode_add_alu(ctx->bc, &alu);
6137 if (r)
6138 return r;
6139 }
6140
6141 /* result.z = RoughApprox2ToX(tmp);*/
6142 if ((inst->Dst[0].Register.WriteMask >> 2) & 0x1) {
6143 if (ctx->bc->chip_class == CAYMAN) {
6144 for (i = 0; i < 3; i++) {
6145 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6146 alu.op = ALU_OP1_EXP_IEEE;
6147 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6148
6149 alu.dst.sel = ctx->temp_reg;
6150 alu.dst.chan = i;
6151 if (i == 2) {
6152 alu.dst.write = 1;
6153 alu.last = 1;
6154 }
6155
6156 r = r600_bytecode_add_alu(ctx->bc, &alu);
6157 if (r)
6158 return r;
6159 }
6160 } else {
6161 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6162 alu.op = ALU_OP1_EXP_IEEE;
6163 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6164
6165 alu.dst.sel = ctx->temp_reg;
6166 alu.dst.write = 1;
6167 alu.dst.chan = 2;
6168
6169 alu.last = 1;
6170
6171 r = r600_bytecode_add_alu(ctx->bc, &alu);
6172 if (r)
6173 return r;
6174 }
6175 }
6176
6177 /* result.w = 1.0;*/
6178 if ((inst->Dst[0].Register.WriteMask >> 3) & 0x1) {
6179 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6180
6181 alu.op = ALU_OP1_MOV;
6182 alu.src[0].sel = V_SQ_ALU_SRC_1;
6183 alu.src[0].chan = 0;
6184
6185 alu.dst.sel = ctx->temp_reg;
6186 alu.dst.chan = 3;
6187 alu.dst.write = 1;
6188 alu.last = 1;
6189 r = r600_bytecode_add_alu(ctx->bc, &alu);
6190 if (r)
6191 return r;
6192 }
6193 return tgsi_helper_copy(ctx, inst);
6194 }
6195
6196 static int tgsi_log(struct r600_shader_ctx *ctx)
6197 {
6198 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6199 struct r600_bytecode_alu alu;
6200 int r;
6201 int i;
6202
6203 /* result.x = floor(log2(|src|)); */
6204 if (inst->Dst[0].Register.WriteMask & 1) {
6205 if (ctx->bc->chip_class == CAYMAN) {
6206 for (i = 0; i < 3; i++) {
6207 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6208
6209 alu.op = ALU_OP1_LOG_IEEE;
6210 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6211 r600_bytecode_src_set_abs(&alu.src[0]);
6212
6213 alu.dst.sel = ctx->temp_reg;
6214 alu.dst.chan = i;
6215 if (i == 0)
6216 alu.dst.write = 1;
6217 if (i == 2)
6218 alu.last = 1;
6219 r = r600_bytecode_add_alu(ctx->bc, &alu);
6220 if (r)
6221 return r;
6222 }
6223
6224 } else {
6225 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6226
6227 alu.op = ALU_OP1_LOG_IEEE;
6228 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6229 r600_bytecode_src_set_abs(&alu.src[0]);
6230
6231 alu.dst.sel = ctx->temp_reg;
6232 alu.dst.chan = 0;
6233 alu.dst.write = 1;
6234 alu.last = 1;
6235 r = r600_bytecode_add_alu(ctx->bc, &alu);
6236 if (r)
6237 return r;
6238 }
6239
6240 alu.op = ALU_OP1_FLOOR;
6241 alu.src[0].sel = ctx->temp_reg;
6242 alu.src[0].chan = 0;
6243
6244 alu.dst.sel = ctx->temp_reg;
6245 alu.dst.chan = 0;
6246 alu.dst.write = 1;
6247 alu.last = 1;
6248
6249 r = r600_bytecode_add_alu(ctx->bc, &alu);
6250 if (r)
6251 return r;
6252 }
6253
6254 /* result.y = |src.x| / (2 ^ floor(log2(|src.x|))); */
6255 if ((inst->Dst[0].Register.WriteMask >> 1) & 1) {
6256
6257 if (ctx->bc->chip_class == CAYMAN) {
6258 for (i = 0; i < 3; i++) {
6259 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6260
6261 alu.op = ALU_OP1_LOG_IEEE;
6262 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6263 r600_bytecode_src_set_abs(&alu.src[0]);
6264
6265 alu.dst.sel = ctx->temp_reg;
6266 alu.dst.chan = i;
6267 if (i == 1)
6268 alu.dst.write = 1;
6269 if (i == 2)
6270 alu.last = 1;
6271
6272 r = r600_bytecode_add_alu(ctx->bc, &alu);
6273 if (r)
6274 return r;
6275 }
6276 } else {
6277 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6278
6279 alu.op = ALU_OP1_LOG_IEEE;
6280 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6281 r600_bytecode_src_set_abs(&alu.src[0]);
6282
6283 alu.dst.sel = ctx->temp_reg;
6284 alu.dst.chan = 1;
6285 alu.dst.write = 1;
6286 alu.last = 1;
6287
6288 r = r600_bytecode_add_alu(ctx->bc, &alu);
6289 if (r)
6290 return r;
6291 }
6292
6293 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6294
6295 alu.op = ALU_OP1_FLOOR;
6296 alu.src[0].sel = ctx->temp_reg;
6297 alu.src[0].chan = 1;
6298
6299 alu.dst.sel = ctx->temp_reg;
6300 alu.dst.chan = 1;
6301 alu.dst.write = 1;
6302 alu.last = 1;
6303
6304 r = r600_bytecode_add_alu(ctx->bc, &alu);
6305 if (r)
6306 return r;
6307
6308 if (ctx->bc->chip_class == CAYMAN) {
6309 for (i = 0; i < 3; i++) {
6310 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6311 alu.op = ALU_OP1_EXP_IEEE;
6312 alu.src[0].sel = ctx->temp_reg;
6313 alu.src[0].chan = 1;
6314
6315 alu.dst.sel = ctx->temp_reg;
6316 alu.dst.chan = i;
6317 if (i == 1)
6318 alu.dst.write = 1;
6319 if (i == 2)
6320 alu.last = 1;
6321
6322 r = r600_bytecode_add_alu(ctx->bc, &alu);
6323 if (r)
6324 return r;
6325 }
6326 } else {
6327 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6328 alu.op = ALU_OP1_EXP_IEEE;
6329 alu.src[0].sel = ctx->temp_reg;
6330 alu.src[0].chan = 1;
6331
6332 alu.dst.sel = ctx->temp_reg;
6333 alu.dst.chan = 1;
6334 alu.dst.write = 1;
6335 alu.last = 1;
6336
6337 r = r600_bytecode_add_alu(ctx->bc, &alu);
6338 if (r)
6339 return r;
6340 }
6341
6342 if (ctx->bc->chip_class == CAYMAN) {
6343 for (i = 0; i < 3; i++) {
6344 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6345 alu.op = ALU_OP1_RECIP_IEEE;
6346 alu.src[0].sel = ctx->temp_reg;
6347 alu.src[0].chan = 1;
6348
6349 alu.dst.sel = ctx->temp_reg;
6350 alu.dst.chan = i;
6351 if (i == 1)
6352 alu.dst.write = 1;
6353 if (i == 2)
6354 alu.last = 1;
6355
6356 r = r600_bytecode_add_alu(ctx->bc, &alu);
6357 if (r)
6358 return r;
6359 }
6360 } else {
6361 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6362 alu.op = ALU_OP1_RECIP_IEEE;
6363 alu.src[0].sel = ctx->temp_reg;
6364 alu.src[0].chan = 1;
6365
6366 alu.dst.sel = ctx->temp_reg;
6367 alu.dst.chan = 1;
6368 alu.dst.write = 1;
6369 alu.last = 1;
6370
6371 r = r600_bytecode_add_alu(ctx->bc, &alu);
6372 if (r)
6373 return r;
6374 }
6375
6376 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6377
6378 alu.op = ALU_OP2_MUL;
6379
6380 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6381 r600_bytecode_src_set_abs(&alu.src[0]);
6382
6383 alu.src[1].sel = ctx->temp_reg;
6384 alu.src[1].chan = 1;
6385
6386 alu.dst.sel = ctx->temp_reg;
6387 alu.dst.chan = 1;
6388 alu.dst.write = 1;
6389 alu.last = 1;
6390
6391 r = r600_bytecode_add_alu(ctx->bc, &alu);
6392 if (r)
6393 return r;
6394 }
6395
6396 /* result.z = log2(|src|);*/
6397 if ((inst->Dst[0].Register.WriteMask >> 2) & 1) {
6398 if (ctx->bc->chip_class == CAYMAN) {
6399 for (i = 0; i < 3; i++) {
6400 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6401
6402 alu.op = ALU_OP1_LOG_IEEE;
6403 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6404 r600_bytecode_src_set_abs(&alu.src[0]);
6405
6406 alu.dst.sel = ctx->temp_reg;
6407 if (i == 2)
6408 alu.dst.write = 1;
6409 alu.dst.chan = i;
6410 if (i == 2)
6411 alu.last = 1;
6412
6413 r = r600_bytecode_add_alu(ctx->bc, &alu);
6414 if (r)
6415 return r;
6416 }
6417 } else {
6418 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6419
6420 alu.op = ALU_OP1_LOG_IEEE;
6421 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6422 r600_bytecode_src_set_abs(&alu.src[0]);
6423
6424 alu.dst.sel = ctx->temp_reg;
6425 alu.dst.write = 1;
6426 alu.dst.chan = 2;
6427 alu.last = 1;
6428
6429 r = r600_bytecode_add_alu(ctx->bc, &alu);
6430 if (r)
6431 return r;
6432 }
6433 }
6434
6435 /* result.w = 1.0; */
6436 if ((inst->Dst[0].Register.WriteMask >> 3) & 1) {
6437 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6438
6439 alu.op = ALU_OP1_MOV;
6440 alu.src[0].sel = V_SQ_ALU_SRC_1;
6441 alu.src[0].chan = 0;
6442
6443 alu.dst.sel = ctx->temp_reg;
6444 alu.dst.chan = 3;
6445 alu.dst.write = 1;
6446 alu.last = 1;
6447
6448 r = r600_bytecode_add_alu(ctx->bc, &alu);
6449 if (r)
6450 return r;
6451 }
6452
6453 return tgsi_helper_copy(ctx, inst);
6454 }
6455
6456 static int tgsi_eg_arl(struct r600_shader_ctx *ctx)
6457 {
6458 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6459 struct r600_bytecode_alu alu;
6460 int r;
6461 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6462
6463 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6464
6465 switch (inst->Instruction.Opcode) {
6466 case TGSI_OPCODE_ARL:
6467 alu.op = ALU_OP1_FLT_TO_INT_FLOOR;
6468 break;
6469 case TGSI_OPCODE_ARR:
6470 alu.op = ALU_OP1_FLT_TO_INT;
6471 break;
6472 case TGSI_OPCODE_UARL:
6473 alu.op = ALU_OP1_MOV;
6474 break;
6475 default:
6476 assert(0);
6477 return -1;
6478 }
6479
6480 for (i = 0; i <= lasti; ++i) {
6481 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
6482 continue;
6483 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6484 alu.last = i == lasti;
6485 alu.dst.sel = ctx->bc->ar_reg;
6486 alu.dst.chan = i;
6487 alu.dst.write = 1;
6488 r = r600_bytecode_add_alu(ctx->bc, &alu);
6489 if (r)
6490 return r;
6491 }
6492
6493 ctx->bc->ar_loaded = 0;
6494 return 0;
6495 }
6496 static int tgsi_r600_arl(struct r600_shader_ctx *ctx)
6497 {
6498 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6499 struct r600_bytecode_alu alu;
6500 int r;
6501 int i, lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
6502
6503 switch (inst->Instruction.Opcode) {
6504 case TGSI_OPCODE_ARL:
6505 memset(&alu, 0, sizeof(alu));
6506 alu.op = ALU_OP1_FLOOR;
6507 alu.dst.sel = ctx->bc->ar_reg;
6508 alu.dst.write = 1;
6509 for (i = 0; i <= lasti; ++i) {
6510 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
6511 alu.dst.chan = i;
6512 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6513 alu.last = i == lasti;
6514 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6515 return r;
6516 }
6517 }
6518
6519 memset(&alu, 0, sizeof(alu));
6520 alu.op = ALU_OP1_FLT_TO_INT;
6521 alu.src[0].sel = ctx->bc->ar_reg;
6522 alu.dst.sel = ctx->bc->ar_reg;
6523 alu.dst.write = 1;
6524 /* FLT_TO_INT is trans-only on r600/r700 */
6525 alu.last = TRUE;
6526 for (i = 0; i <= lasti; ++i) {
6527 alu.dst.chan = i;
6528 alu.src[0].chan = i;
6529 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6530 return r;
6531 }
6532 break;
6533 case TGSI_OPCODE_ARR:
6534 memset(&alu, 0, sizeof(alu));
6535 alu.op = ALU_OP1_FLT_TO_INT;
6536 alu.dst.sel = ctx->bc->ar_reg;
6537 alu.dst.write = 1;
6538 /* FLT_TO_INT is trans-only on r600/r700 */
6539 alu.last = TRUE;
6540 for (i = 0; i <= lasti; ++i) {
6541 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
6542 alu.dst.chan = i;
6543 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6544 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6545 return r;
6546 }
6547 }
6548 break;
6549 case TGSI_OPCODE_UARL:
6550 memset(&alu, 0, sizeof(alu));
6551 alu.op = ALU_OP1_MOV;
6552 alu.dst.sel = ctx->bc->ar_reg;
6553 alu.dst.write = 1;
6554 for (i = 0; i <= lasti; ++i) {
6555 if (inst->Dst[0].Register.WriteMask & (1 << i)) {
6556 alu.dst.chan = i;
6557 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6558 alu.last = i == lasti;
6559 if ((r = r600_bytecode_add_alu(ctx->bc, &alu)))
6560 return r;
6561 }
6562 }
6563 break;
6564 default:
6565 assert(0);
6566 return -1;
6567 }
6568
6569 ctx->bc->ar_loaded = 0;
6570 return 0;
6571 }
6572
6573 static int tgsi_opdst(struct r600_shader_ctx *ctx)
6574 {
6575 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
6576 struct r600_bytecode_alu alu;
6577 int i, r = 0;
6578
6579 for (i = 0; i < 4; i++) {
6580 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6581
6582 alu.op = ALU_OP2_MUL;
6583 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
6584
6585 if (i == 0 || i == 3) {
6586 alu.src[0].sel = V_SQ_ALU_SRC_1;
6587 } else {
6588 r600_bytecode_src(&alu.src[0], &ctx->src[0], i);
6589 }
6590
6591 if (i == 0 || i == 2) {
6592 alu.src[1].sel = V_SQ_ALU_SRC_1;
6593 } else {
6594 r600_bytecode_src(&alu.src[1], &ctx->src[1], i);
6595 }
6596 if (i == 3)
6597 alu.last = 1;
6598 r = r600_bytecode_add_alu(ctx->bc, &alu);
6599 if (r)
6600 return r;
6601 }
6602 return 0;
6603 }
6604
6605 static int emit_logic_pred(struct r600_shader_ctx *ctx, int opcode, int alu_type)
6606 {
6607 struct r600_bytecode_alu alu;
6608 int r;
6609
6610 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
6611 alu.op = opcode;
6612 alu.execute_mask = 1;
6613 alu.update_pred = 1;
6614
6615 alu.dst.sel = ctx->temp_reg;
6616 alu.dst.write = 1;
6617 alu.dst.chan = 0;
6618
6619 r600_bytecode_src(&alu.src[0], &ctx->src[0], 0);
6620 alu.src[1].sel = V_SQ_ALU_SRC_0;
6621 alu.src[1].chan = 0;
6622
6623 alu.last = 1;
6624
6625 r = r600_bytecode_add_alu_type(ctx->bc, &alu, alu_type);
6626 if (r)
6627 return r;
6628 return 0;
6629 }
6630
6631 static int pops(struct r600_shader_ctx *ctx, int pops)
6632 {
6633 unsigned force_pop = ctx->bc->force_add_cf;
6634
6635 if (!force_pop) {
6636 int alu_pop = 3;
6637 if (ctx->bc->cf_last) {
6638 if (ctx->bc->cf_last->op == CF_OP_ALU)
6639 alu_pop = 0;
6640 else if (ctx->bc->cf_last->op == CF_OP_ALU_POP_AFTER)
6641 alu_pop = 1;
6642 }
6643 alu_pop += pops;
6644 if (alu_pop == 1) {
6645 ctx->bc->cf_last->op = CF_OP_ALU_POP_AFTER;
6646 ctx->bc->force_add_cf = 1;
6647 } else if (alu_pop == 2) {
6648 ctx->bc->cf_last->op = CF_OP_ALU_POP2_AFTER;
6649 ctx->bc->force_add_cf = 1;
6650 } else {
6651 force_pop = 1;
6652 }
6653 }
6654
6655 if (force_pop) {
6656 r600_bytecode_add_cfinst(ctx->bc, CF_OP_POP);
6657 ctx->bc->cf_last->pop_count = pops;
6658 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
6659 }
6660
6661 return 0;
6662 }
6663
6664 static inline void callstack_update_max_depth(struct r600_shader_ctx *ctx,
6665 unsigned reason)
6666 {
6667 struct r600_stack_info *stack = &ctx->bc->stack;
6668 unsigned elements, entries;
6669
6670 unsigned entry_size = stack->entry_size;
6671
6672 elements = (stack->loop + stack->push_wqm ) * entry_size;
6673 elements += stack->push;
6674
6675 switch (ctx->bc->chip_class) {
6676 case R600:
6677 case R700:
6678 /* pre-r8xx: if any non-WQM PUSH instruction is invoked, 2 elements on
6679 * the stack must be reserved to hold the current active/continue
6680 * masks */
6681 if (reason == FC_PUSH_VPM) {
6682 elements += 2;
6683 }
6684 break;
6685
6686 case CAYMAN:
6687 /* r9xx: any stack operation on empty stack consumes 2 additional
6688 * elements */
6689 elements += 2;
6690
6691 /* fallthrough */
6692 /* FIXME: do the two elements added above cover the cases for the
6693 * r8xx+ below? */
6694
6695 case EVERGREEN:
6696 /* r8xx+: 2 extra elements are not always required, but one extra
6697 * element must be added for each of the following cases:
6698 * 1. There is an ALU_ELSE_AFTER instruction at the point of greatest
6699 * stack usage.
6700 * (Currently we don't use ALU_ELSE_AFTER.)
6701 * 2. There are LOOP/WQM frames on the stack when any flavor of non-WQM
6702 * PUSH instruction executed.
6703 *
6704 * NOTE: it seems we also need to reserve additional element in some
6705 * other cases, e.g. when we have 4 levels of PUSH_VPM in the shader,
6706 * then STACK_SIZE should be 2 instead of 1 */
6707 if (reason == FC_PUSH_VPM) {
6708 elements += 1;
6709 }
6710 break;
6711
6712 default:
6713 assert(0);
6714 break;
6715 }
6716
6717 /* NOTE: it seems STACK_SIZE is interpreted by hw as if entry_size is 4
6718 * for all chips, so we use 4 in the final formula, not the real entry_size
6719 * for the chip */
6720 entry_size = 4;
6721
6722 entries = (elements + (entry_size - 1)) / entry_size;
6723
6724 if (entries > stack->max_entries)
6725 stack->max_entries = entries;
6726 }
6727
6728 static inline void callstack_pop(struct r600_shader_ctx *ctx, unsigned reason)
6729 {
6730 switch(reason) {
6731 case FC_PUSH_VPM:
6732 --ctx->bc->stack.push;
6733 assert(ctx->bc->stack.push >= 0);
6734 break;
6735 case FC_PUSH_WQM:
6736 --ctx->bc->stack.push_wqm;
6737 assert(ctx->bc->stack.push_wqm >= 0);
6738 break;
6739 case FC_LOOP:
6740 --ctx->bc->stack.loop;
6741 assert(ctx->bc->stack.loop >= 0);
6742 break;
6743 default:
6744 assert(0);
6745 break;
6746 }
6747 }
6748
6749 static inline void callstack_push(struct r600_shader_ctx *ctx, unsigned reason)
6750 {
6751 switch (reason) {
6752 case FC_PUSH_VPM:
6753 ++ctx->bc->stack.push;
6754 break;
6755 case FC_PUSH_WQM:
6756 ++ctx->bc->stack.push_wqm;
6757 case FC_LOOP:
6758 ++ctx->bc->stack.loop;
6759 break;
6760 default:
6761 assert(0);
6762 }
6763
6764 callstack_update_max_depth(ctx, reason);
6765 }
6766
6767 static void fc_set_mid(struct r600_shader_ctx *ctx, int fc_sp)
6768 {
6769 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[fc_sp];
6770
6771 sp->mid = realloc((void *)sp->mid,
6772 sizeof(struct r600_bytecode_cf *) * (sp->num_mid + 1));
6773 sp->mid[sp->num_mid] = ctx->bc->cf_last;
6774 sp->num_mid++;
6775 }
6776
6777 static void fc_pushlevel(struct r600_shader_ctx *ctx, int type)
6778 {
6779 ctx->bc->fc_sp++;
6780 ctx->bc->fc_stack[ctx->bc->fc_sp].type = type;
6781 ctx->bc->fc_stack[ctx->bc->fc_sp].start = ctx->bc->cf_last;
6782 }
6783
6784 static void fc_poplevel(struct r600_shader_ctx *ctx)
6785 {
6786 struct r600_cf_stack_entry *sp = &ctx->bc->fc_stack[ctx->bc->fc_sp];
6787 free(sp->mid);
6788 sp->mid = NULL;
6789 sp->num_mid = 0;
6790 sp->start = NULL;
6791 sp->type = 0;
6792 ctx->bc->fc_sp--;
6793 }
6794
#if 0
/*
 * Disabled helpers for subroutine return and flag-driven loop exit.
 * Nothing in this region is compiled; it is kept as a sketch only.  The
 * calls are kept syntactically valid so the region can be re-enabled
 * without first repairing it.
 */

/* Emit a CF RETURN instruction. */
static int emit_return(struct r600_shader_ctx *ctx)
{
	r600_bytecode_add_cfinst(ctx->bc, CF_OP_RETURN);
	return 0;
}

/* Emit a CF JUMP popping `pops` entries; the target offset is not set yet. */
static int emit_jump_to_offset(struct r600_shader_ctx *ctx, int pops, int offset)
{

	r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
	ctx->bc->cf_last->pop_count = pops;
	/* XXX work out offset */
	return 0;
}

/* Placeholder: would record a "return requested inside loop" flag. */
static int emit_setret_in_loop_flag(struct r600_shader_ctx *ctx, unsigned flag_value)
{
	return 0;
}

/* Placeholder: would test the flag set by emit_setret_in_loop_flag(). */
static void emit_testflag(struct r600_shader_ctx *ctx)
{

}

/* Test the flag and, through a jump/pop sequence, emit a RETURN. */
static void emit_return_on_flag(struct r600_shader_ctx *ctx, unsigned ifidx)
{
	emit_testflag(ctx);
	emit_jump_to_offset(ctx, 1, 4);
	emit_setret_in_loop_flag(ctx, V_SQ_ALU_SRC_0);
	pops(ctx, ifidx + 1);
	emit_return(ctx);
}

/* Test the flag and emit the current instruction's CF op as a loop exit
 * registered on flow-control level `fc_sp`. */
static void break_loop_on_flag(struct r600_shader_ctx *ctx, unsigned fc_sp)
{
	emit_testflag(ctx);

	r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
	ctx->bc->cf_last->pop_count = 1;

	fc_set_mid(ctx, fc_sp);

	pops(ctx, 1);
}
#endif
6842
6843 static int emit_if(struct r600_shader_ctx *ctx, int opcode)
6844 {
6845 int alu_type = CF_OP_ALU_PUSH_BEFORE;
6846
6847 /* There is a hardware bug on Cayman where a BREAK/CONTINUE followed by
6848 * LOOP_STARTxxx for nested loops may put the branch stack into a state
6849 * such that ALU_PUSH_BEFORE doesn't work as expected. Workaround this
6850 * by replacing the ALU_PUSH_BEFORE with a PUSH + ALU */
6851 if (ctx->bc->chip_class == CAYMAN && ctx->bc->stack.loop > 1) {
6852 r600_bytecode_add_cfinst(ctx->bc, CF_OP_PUSH);
6853 ctx->bc->cf_last->cf_addr = ctx->bc->cf_last->id + 2;
6854 alu_type = CF_OP_ALU;
6855 }
6856
6857 emit_logic_pred(ctx, opcode, alu_type);
6858
6859 r600_bytecode_add_cfinst(ctx->bc, CF_OP_JUMP);
6860
6861 fc_pushlevel(ctx, FC_IF);
6862
6863 callstack_push(ctx, FC_PUSH_VPM);
6864 return 0;
6865 }
6866
6867 static int tgsi_if(struct r600_shader_ctx *ctx)
6868 {
6869 return emit_if(ctx, ALU_OP2_PRED_SETNE);
6870 }
6871
6872 static int tgsi_uif(struct r600_shader_ctx *ctx)
6873 {
6874 return emit_if(ctx, ALU_OP2_PRED_SETNE_INT);
6875 }
6876
6877 static int tgsi_else(struct r600_shader_ctx *ctx)
6878 {
6879 r600_bytecode_add_cfinst(ctx->bc, CF_OP_ELSE);
6880 ctx->bc->cf_last->pop_count = 1;
6881
6882 fc_set_mid(ctx, ctx->bc->fc_sp);
6883 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id;
6884 return 0;
6885 }
6886
6887 static int tgsi_endif(struct r600_shader_ctx *ctx)
6888 {
6889 pops(ctx, 1);
6890 if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_IF) {
6891 R600_ERR("if/endif unbalanced in shader\n");
6892 return -1;
6893 }
6894
6895 if (ctx->bc->fc_stack[ctx->bc->fc_sp].mid == NULL) {
6896 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
6897 ctx->bc->fc_stack[ctx->bc->fc_sp].start->pop_count = 1;
6898 } else {
6899 ctx->bc->fc_stack[ctx->bc->fc_sp].mid[0]->cf_addr = ctx->bc->cf_last->id + 2;
6900 }
6901 fc_poplevel(ctx);
6902
6903 callstack_pop(ctx, FC_PUSH_VPM);
6904 return 0;
6905 }
6906
6907 static int tgsi_bgnloop(struct r600_shader_ctx *ctx)
6908 {
6909 /* LOOP_START_DX10 ignores the LOOP_CONFIG* registers, so it is not
6910 * limited to 4096 iterations, like the other LOOP_* instructions. */
6911 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_START_DX10);
6912
6913 fc_pushlevel(ctx, FC_LOOP);
6914
6915 /* check stack depth */
6916 callstack_push(ctx, FC_LOOP);
6917 return 0;
6918 }
6919
6920 static int tgsi_endloop(struct r600_shader_ctx *ctx)
6921 {
6922 int i;
6923
6924 r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_END);
6925
6926 if (ctx->bc->fc_stack[ctx->bc->fc_sp].type != FC_LOOP) {
6927 R600_ERR("loop/endloop in shader code are not paired.\n");
6928 return -EINVAL;
6929 }
6930
6931 /* fixup loop pointers - from r600isa
6932 LOOP END points to CF after LOOP START,
6933 LOOP START point to CF after LOOP END
6934 BRK/CONT point to LOOP END CF
6935 */
6936 ctx->bc->cf_last->cf_addr = ctx->bc->fc_stack[ctx->bc->fc_sp].start->id + 2;
6937
6938 ctx->bc->fc_stack[ctx->bc->fc_sp].start->cf_addr = ctx->bc->cf_last->id + 2;
6939
6940 for (i = 0; i < ctx->bc->fc_stack[ctx->bc->fc_sp].num_mid; i++) {
6941 ctx->bc->fc_stack[ctx->bc->fc_sp].mid[i]->cf_addr = ctx->bc->cf_last->id;
6942 }
6943 /* XXX add LOOPRET support */
6944 fc_poplevel(ctx);
6945 callstack_pop(ctx, FC_LOOP);
6946 return 0;
6947 }
6948
6949 static int tgsi_loop_breakc(struct r600_shader_ctx *ctx)
6950 {
6951 int r;
6952 unsigned int fscp;
6953
6954 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
6955 {
6956 if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
6957 break;
6958 }
6959 if (fscp == 0) {
6960 R600_ERR("BREAKC not inside loop/endloop pair\n");
6961 return -EINVAL;
6962 }
6963
6964 if (ctx->bc->chip_class == EVERGREEN &&
6965 ctx->bc->family != CHIP_CYPRESS &&
6966 ctx->bc->family != CHIP_JUNIPER) {
6967 /* HW bug: ALU_BREAK does not save the active mask correctly */
6968 r = tgsi_uif(ctx);
6969 if (r)
6970 return r;
6971
6972 r = r600_bytecode_add_cfinst(ctx->bc, CF_OP_LOOP_BREAK);
6973 if (r)
6974 return r;
6975 fc_set_mid(ctx, fscp);
6976
6977 return tgsi_endif(ctx);
6978 } else {
6979 r = emit_logic_pred(ctx, ALU_OP2_PRED_SETE_INT, CF_OP_ALU_BREAK);
6980 if (r)
6981 return r;
6982 fc_set_mid(ctx, fscp);
6983 }
6984
6985 return 0;
6986 }
6987
6988 static int tgsi_loop_brk_cont(struct r600_shader_ctx *ctx)
6989 {
6990 unsigned int fscp;
6991
6992 for (fscp = ctx->bc->fc_sp; fscp > 0; fscp--)
6993 {
6994 if (FC_LOOP == ctx->bc->fc_stack[fscp].type)
6995 break;
6996 }
6997
6998 if (fscp == 0) {
6999 R600_ERR("Break not inside loop/endloop pair\n");
7000 return -EINVAL;
7001 }
7002
7003 r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
7004
7005 fc_set_mid(ctx, fscp);
7006
7007 return 0;
7008 }
7009
7010 static int tgsi_gs_emit(struct r600_shader_ctx *ctx)
7011 {
7012 if (ctx->inst_info->op == CF_OP_EMIT_VERTEX)
7013 emit_gs_ring_writes(ctx, TRUE);
7014
7015 return r600_bytecode_add_cfinst(ctx->bc, ctx->inst_info->op);
7016 }
7017
7018 static int tgsi_umad(struct r600_shader_ctx *ctx)
7019 {
7020 struct tgsi_full_instruction *inst = &ctx->parse.FullToken.FullInstruction;
7021 struct r600_bytecode_alu alu;
7022 int i, j, k, r;
7023 int lasti = tgsi_last_instruction(inst->Dst[0].Register.WriteMask);
7024
7025 /* src0 * src1 */
7026 for (i = 0; i < lasti + 1; i++) {
7027 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7028 continue;
7029
7030 if (ctx->bc->chip_class == CAYMAN) {
7031 for (j = 0 ; j < 4; j++) {
7032 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7033
7034 alu.op = ALU_OP2_MULLO_UINT;
7035 for (k = 0; k < inst->Instruction.NumSrcRegs; k++) {
7036 r600_bytecode_src(&alu.src[k], &ctx->src[k], i);
7037 }
7038 tgsi_dst(ctx, &inst->Dst[0], j, &alu.dst);
7039 alu.dst.sel = ctx->temp_reg;
7040 alu.dst.write = (j == i);
7041 if (j == 3)
7042 alu.last = 1;
7043 r = r600_bytecode_add_alu(ctx->bc, &alu);
7044 if (r)
7045 return r;
7046 }
7047 } else {
7048 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7049
7050 alu.dst.chan = i;
7051 alu.dst.sel = ctx->temp_reg;
7052 alu.dst.write = 1;
7053
7054 alu.op = ALU_OP2_MULLO_UINT;
7055 for (j = 0; j < 2; j++) {
7056 r600_bytecode_src(&alu.src[j], &ctx->src[j], i);
7057 }
7058
7059 alu.last = 1;
7060 r = r600_bytecode_add_alu(ctx->bc, &alu);
7061 if (r)
7062 return r;
7063 }
7064 }
7065
7066
7067 for (i = 0; i < lasti + 1; i++) {
7068 if (!(inst->Dst[0].Register.WriteMask & (1 << i)))
7069 continue;
7070
7071 memset(&alu, 0, sizeof(struct r600_bytecode_alu));
7072 tgsi_dst(ctx, &inst->Dst[0], i, &alu.dst);
7073
7074 alu.op = ALU_OP2_ADD_INT;
7075
7076 alu.src[0].sel = ctx->temp_reg;
7077 alu.src[0].chan = i;
7078
7079 r600_bytecode_src(&alu.src[1], &ctx->src[2], i);
7080 if (i == lasti) {
7081 alu.last = 1;
7082 }
7083 r = r600_bytecode_add_alu(ctx->bc, &alu);
7084 if (r)
7085 return r;
7086 }
7087 return 0;
7088 }
7089
7090 static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
7091 {TGSI_OPCODE_ARL, 0, ALU_OP0_NOP, tgsi_r600_arl},
7092 {TGSI_OPCODE_MOV, 0, ALU_OP1_MOV, tgsi_op2},
7093 {TGSI_OPCODE_LIT, 0, ALU_OP0_NOP, tgsi_lit},
7094
7095 /* XXX:
7096 * For state trackers other than OpenGL, we'll want to use
7097 * _RECIP_IEEE instead.
7098 */
7099 {TGSI_OPCODE_RCP, 0, ALU_OP1_RECIP_CLAMPED, tgsi_trans_srcx_replicate},
7100
7101 {TGSI_OPCODE_RSQ, 0, ALU_OP0_NOP, tgsi_rsq},
7102 {TGSI_OPCODE_EXP, 0, ALU_OP0_NOP, tgsi_exp},
7103 {TGSI_OPCODE_LOG, 0, ALU_OP0_NOP, tgsi_log},
7104 {TGSI_OPCODE_MUL, 0, ALU_OP2_MUL, tgsi_op2},
7105 {TGSI_OPCODE_ADD, 0, ALU_OP2_ADD, tgsi_op2},
7106 {TGSI_OPCODE_DP3, 0, ALU_OP2_DOT4, tgsi_dp},
7107 {TGSI_OPCODE_DP4, 0, ALU_OP2_DOT4, tgsi_dp},
7108 {TGSI_OPCODE_DST, 0, ALU_OP0_NOP, tgsi_opdst},
7109 {TGSI_OPCODE_MIN, 0, ALU_OP2_MIN, tgsi_op2},
7110 {TGSI_OPCODE_MAX, 0, ALU_OP2_MAX, tgsi_op2},
7111 {TGSI_OPCODE_SLT, 0, ALU_OP2_SETGT, tgsi_op2_swap},
7112 {TGSI_OPCODE_SGE, 0, ALU_OP2_SETGE, tgsi_op2},
7113 {TGSI_OPCODE_MAD, 1, ALU_OP3_MULADD, tgsi_op3},
7114 {TGSI_OPCODE_SUB, 0, ALU_OP2_ADD, tgsi_op2},
7115 {TGSI_OPCODE_LRP, 0, ALU_OP0_NOP, tgsi_lrp},
7116 {TGSI_OPCODE_CND, 0, ALU_OP0_NOP, tgsi_unsupported},
7117 {TGSI_OPCODE_SQRT, 0, ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
7118 {TGSI_OPCODE_DP2A, 0, ALU_OP0_NOP, tgsi_unsupported},
7119 /* gap */
7120 {22, 0, ALU_OP0_NOP, tgsi_unsupported},
7121 {23, 0, ALU_OP0_NOP, tgsi_unsupported},
7122 {TGSI_OPCODE_FRC, 0, ALU_OP1_FRACT, tgsi_op2},
7123 {TGSI_OPCODE_CLAMP, 0, ALU_OP0_NOP, tgsi_unsupported},
7124 {TGSI_OPCODE_FLR, 0, ALU_OP1_FLOOR, tgsi_op2},
7125 {TGSI_OPCODE_ROUND, 0, ALU_OP1_RNDNE, tgsi_op2},
7126 {TGSI_OPCODE_EX2, 0, ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
7127 {TGSI_OPCODE_LG2, 0, ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
7128 {TGSI_OPCODE_POW, 0, ALU_OP0_NOP, tgsi_pow},
7129 {TGSI_OPCODE_XPD, 0, ALU_OP0_NOP, tgsi_xpd},
7130 /* gap */
7131 {32, 0, ALU_OP0_NOP, tgsi_unsupported},
7132 {TGSI_OPCODE_ABS, 0, ALU_OP1_MOV, tgsi_op2},
7133 {TGSI_OPCODE_RCC, 0, ALU_OP0_NOP, tgsi_unsupported},
7134 {TGSI_OPCODE_DPH, 0, ALU_OP2_DOT4, tgsi_dp},
7135 {TGSI_OPCODE_COS, 0, ALU_OP1_COS, tgsi_trig},
7136 {TGSI_OPCODE_DDX, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
7137 {TGSI_OPCODE_DDY, 0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
7138 {TGSI_OPCODE_KILL, 0, ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
7139 {TGSI_OPCODE_PK2H, 0, ALU_OP0_NOP, tgsi_unsupported},
7140 {TGSI_OPCODE_PK2US, 0, ALU_OP0_NOP, tgsi_unsupported},
7141 {TGSI_OPCODE_PK4B, 0, ALU_OP0_NOP, tgsi_unsupported},
7142 {TGSI_OPCODE_PK4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
7143 {TGSI_OPCODE_RFL, 0, ALU_OP0_NOP, tgsi_unsupported},
7144 {TGSI_OPCODE_SEQ, 0, ALU_OP2_SETE, tgsi_op2},
7145 {TGSI_OPCODE_SFL, 0, ALU_OP0_NOP, tgsi_unsupported},
7146 {TGSI_OPCODE_SGT, 0, ALU_OP2_SETGT, tgsi_op2},
7147 {TGSI_OPCODE_SIN, 0, ALU_OP1_SIN, tgsi_trig},
7148 {TGSI_OPCODE_SLE, 0, ALU_OP2_SETGE, tgsi_op2_swap},
7149 {TGSI_OPCODE_SNE, 0, ALU_OP2_SETNE, tgsi_op2},
7150 {TGSI_OPCODE_STR, 0, ALU_OP0_NOP, tgsi_unsupported},
7151 {TGSI_OPCODE_TEX, 0, FETCH_OP_SAMPLE, tgsi_tex},
7152 {TGSI_OPCODE_TXD, 0, FETCH_OP_SAMPLE_G, tgsi_tex},
7153 {TGSI_OPCODE_TXP, 0, FETCH_OP_SAMPLE, tgsi_tex},
7154 {TGSI_OPCODE_UP2H, 0, ALU_OP0_NOP, tgsi_unsupported},
7155 {TGSI_OPCODE_UP2US, 0, ALU_OP0_NOP, tgsi_unsupported},
7156 {TGSI_OPCODE_UP4B, 0, ALU_OP0_NOP, tgsi_unsupported},
7157 {TGSI_OPCODE_UP4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
7158 {TGSI_OPCODE_X2D, 0, ALU_OP0_NOP, tgsi_unsupported},
7159 {TGSI_OPCODE_ARA, 0, ALU_OP0_NOP, tgsi_unsupported},
7160 {TGSI_OPCODE_ARR, 0, ALU_OP0_NOP, tgsi_r600_arl},
7161 {TGSI_OPCODE_BRA, 0, ALU_OP0_NOP, tgsi_unsupported},
7162 {TGSI_OPCODE_CAL, 0, ALU_OP0_NOP, tgsi_unsupported},
7163 {TGSI_OPCODE_RET, 0, ALU_OP0_NOP, tgsi_unsupported},
7164 {TGSI_OPCODE_SSG, 0, ALU_OP0_NOP, tgsi_ssg},
7165 {TGSI_OPCODE_CMP, 0, ALU_OP0_NOP, tgsi_cmp},
7166 {TGSI_OPCODE_SCS, 0, ALU_OP0_NOP, tgsi_scs},
7167 {TGSI_OPCODE_TXB, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
7168 {TGSI_OPCODE_NRM, 0, ALU_OP0_NOP, tgsi_unsupported},
7169 {TGSI_OPCODE_DIV, 0, ALU_OP0_NOP, tgsi_unsupported},
7170 {TGSI_OPCODE_DP2, 0, ALU_OP2_DOT4, tgsi_dp},
7171 {TGSI_OPCODE_TXL, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
7172 {TGSI_OPCODE_BRK, 0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
7173 {TGSI_OPCODE_IF, 0, ALU_OP0_NOP, tgsi_if},
7174 {TGSI_OPCODE_UIF, 0, ALU_OP0_NOP, tgsi_uif},
7175 {76, 0, ALU_OP0_NOP, tgsi_unsupported},
7176 {TGSI_OPCODE_ELSE, 0, ALU_OP0_NOP, tgsi_else},
7177 {TGSI_OPCODE_ENDIF, 0, ALU_OP0_NOP, tgsi_endif},
7178 {TGSI_OPCODE_DDX_FINE, 0, ALU_OP0_NOP, tgsi_unsupported},
7179 {TGSI_OPCODE_DDY_FINE, 0, ALU_OP0_NOP, tgsi_unsupported},
7180 {TGSI_OPCODE_PUSHA, 0, ALU_OP0_NOP, tgsi_unsupported},
7181 {TGSI_OPCODE_POPA, 0, ALU_OP0_NOP, tgsi_unsupported},
7182 {TGSI_OPCODE_CEIL, 0, ALU_OP1_CEIL, tgsi_op2},
7183 {TGSI_OPCODE_I2F, 0, ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
7184 {TGSI_OPCODE_NOT, 0, ALU_OP1_NOT_INT, tgsi_op2},
7185 {TGSI_OPCODE_TRUNC, 0, ALU_OP1_TRUNC, tgsi_op2},
7186 {TGSI_OPCODE_SHL, 0, ALU_OP2_LSHL_INT, tgsi_op2_trans},
7187 /* gap */
7188 {88, 0, ALU_OP0_NOP, tgsi_unsupported},
7189 {TGSI_OPCODE_AND, 0, ALU_OP2_AND_INT, tgsi_op2},
7190 {TGSI_OPCODE_OR, 0, ALU_OP2_OR_INT, tgsi_op2},
7191 {TGSI_OPCODE_MOD, 0, ALU_OP0_NOP, tgsi_imod},
7192 {TGSI_OPCODE_XOR, 0, ALU_OP2_XOR_INT, tgsi_op2},
7193 {TGSI_OPCODE_SAD, 0, ALU_OP0_NOP, tgsi_unsupported},
7194 {TGSI_OPCODE_TXF, 0, FETCH_OP_LD, tgsi_tex},
7195 {TGSI_OPCODE_TXQ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
7196 {TGSI_OPCODE_CONT, 0, CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
7197 {TGSI_OPCODE_EMIT, 0, CF_OP_EMIT_VERTEX, tgsi_gs_emit},
7198 {TGSI_OPCODE_ENDPRIM, 0, CF_OP_CUT_VERTEX, tgsi_gs_emit},
7199 {TGSI_OPCODE_BGNLOOP, 0, ALU_OP0_NOP, tgsi_bgnloop},
7200 {TGSI_OPCODE_BGNSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
7201 {TGSI_OPCODE_ENDLOOP, 0, ALU_OP0_NOP, tgsi_endloop},
7202 {TGSI_OPCODE_ENDSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
7203 {TGSI_OPCODE_TXQ_LZ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
7204 /* gap */
7205 {104, 0, ALU_OP0_NOP, tgsi_unsupported},
7206 {105, 0, ALU_OP0_NOP, tgsi_unsupported},
7207 {106, 0, ALU_OP0_NOP, tgsi_unsupported},
7208 {TGSI_OPCODE_NOP, 0, ALU_OP0_NOP, tgsi_unsupported},
7209 {TGSI_OPCODE_FSEQ, 0, ALU_OP2_SETE_DX10, tgsi_op2},
7210 {TGSI_OPCODE_FSGE, 0, ALU_OP2_SETGE_DX10, tgsi_op2},
7211 {TGSI_OPCODE_FSLT, 0, ALU_OP2_SETGT_DX10, tgsi_op2_swap},
7212 {TGSI_OPCODE_FSNE, 0, ALU_OP2_SETNE_DX10, tgsi_op2_swap},
7213 {TGSI_OPCODE_NRM4, 0, ALU_OP0_NOP, tgsi_unsupported},
7214 {TGSI_OPCODE_CALLNZ, 0, ALU_OP0_NOP, tgsi_unsupported},
7215 /* gap */
7216 {114, 0, ALU_OP0_NOP, tgsi_unsupported},
7217 {TGSI_OPCODE_BREAKC, 0, ALU_OP0_NOP, tgsi_loop_breakc},
7218 {TGSI_OPCODE_KILL_IF, 0, ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
7219 {TGSI_OPCODE_END, 0, ALU_OP0_NOP, tgsi_end}, /* aka HALT */
7220 /* gap */
7221 {118, 0, ALU_OP0_NOP, tgsi_unsupported},
7222 {TGSI_OPCODE_F2I, 0, ALU_OP1_FLT_TO_INT, tgsi_op2_trans},
7223 {TGSI_OPCODE_IDIV, 0, ALU_OP0_NOP, tgsi_idiv},
7224 {TGSI_OPCODE_IMAX, 0, ALU_OP2_MAX_INT, tgsi_op2},
7225 {TGSI_OPCODE_IMIN, 0, ALU_OP2_MIN_INT, tgsi_op2},
7226 {TGSI_OPCODE_INEG, 0, ALU_OP2_SUB_INT, tgsi_ineg},
7227 {TGSI_OPCODE_ISGE, 0, ALU_OP2_SETGE_INT, tgsi_op2},
7228 {TGSI_OPCODE_ISHR, 0, ALU_OP2_ASHR_INT, tgsi_op2_trans},
7229 {TGSI_OPCODE_ISLT, 0, ALU_OP2_SETGT_INT, tgsi_op2_swap},
7230 {TGSI_OPCODE_F2U, 0, ALU_OP1_FLT_TO_UINT, tgsi_op2_trans},
7231 {TGSI_OPCODE_U2F, 0, ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
7232 {TGSI_OPCODE_UADD, 0, ALU_OP2_ADD_INT, tgsi_op2},
7233 {TGSI_OPCODE_UDIV, 0, ALU_OP0_NOP, tgsi_udiv},
7234 {TGSI_OPCODE_UMAD, 0, ALU_OP0_NOP, tgsi_umad},
7235 {TGSI_OPCODE_UMAX, 0, ALU_OP2_MAX_UINT, tgsi_op2},
7236 {TGSI_OPCODE_UMIN, 0, ALU_OP2_MIN_UINT, tgsi_op2},
7237 {TGSI_OPCODE_UMOD, 0, ALU_OP0_NOP, tgsi_umod},
7238 {TGSI_OPCODE_UMUL, 0, ALU_OP2_MULLO_UINT, tgsi_op2_trans},
7239 {TGSI_OPCODE_USEQ, 0, ALU_OP2_SETE_INT, tgsi_op2},
7240 {TGSI_OPCODE_USGE, 0, ALU_OP2_SETGE_UINT, tgsi_op2},
7241 {TGSI_OPCODE_USHR, 0, ALU_OP2_LSHR_INT, tgsi_op2_trans},
7242 {TGSI_OPCODE_USLT, 0, ALU_OP2_SETGT_UINT, tgsi_op2_swap},
7243 {TGSI_OPCODE_USNE, 0, ALU_OP2_SETNE_INT, tgsi_op2_swap},
7244 {TGSI_OPCODE_SWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
7245 {TGSI_OPCODE_CASE, 0, ALU_OP0_NOP, tgsi_unsupported},
7246 {TGSI_OPCODE_DEFAULT, 0, ALU_OP0_NOP, tgsi_unsupported},
7247 {TGSI_OPCODE_ENDSWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
7248 {TGSI_OPCODE_SAMPLE, 0, 0, tgsi_unsupported},
7249 {TGSI_OPCODE_SAMPLE_I, 0, 0, tgsi_unsupported},
7250 {TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
7251 {TGSI_OPCODE_SAMPLE_B, 0, 0, tgsi_unsupported},
7252 {TGSI_OPCODE_SAMPLE_C, 0, 0, tgsi_unsupported},
7253 {TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
7254 {TGSI_OPCODE_SAMPLE_D, 0, 0, tgsi_unsupported},
7255 {TGSI_OPCODE_SAMPLE_L, 0, 0, tgsi_unsupported},
7256 {TGSI_OPCODE_GATHER4, 0, 0, tgsi_unsupported},
7257 {TGSI_OPCODE_SVIEWINFO, 0, 0, tgsi_unsupported},
7258 {TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
7259 {TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
7260 {TGSI_OPCODE_UARL, 0, ALU_OP1_MOVA_INT, tgsi_r600_arl},
7261 {TGSI_OPCODE_UCMP, 0, ALU_OP0_NOP, tgsi_ucmp},
7262 {TGSI_OPCODE_IABS, 0, 0, tgsi_iabs},
7263 {TGSI_OPCODE_ISSG, 0, 0, tgsi_issg},
7264 {TGSI_OPCODE_LOAD, 0, ALU_OP0_NOP, tgsi_unsupported},
7265 {TGSI_OPCODE_STORE, 0, ALU_OP0_NOP, tgsi_unsupported},
7266 {TGSI_OPCODE_MFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
7267 {TGSI_OPCODE_LFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
7268 {TGSI_OPCODE_SFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
7269 {TGSI_OPCODE_BARRIER, 0, ALU_OP0_NOP, tgsi_unsupported},
7270 {TGSI_OPCODE_ATOMUADD, 0, ALU_OP0_NOP, tgsi_unsupported},
7271 {TGSI_OPCODE_ATOMXCHG, 0, ALU_OP0_NOP, tgsi_unsupported},
7272 {TGSI_OPCODE_ATOMCAS, 0, ALU_OP0_NOP, tgsi_unsupported},
7273 {TGSI_OPCODE_ATOMAND, 0, ALU_OP0_NOP, tgsi_unsupported},
7274 {TGSI_OPCODE_ATOMOR, 0, ALU_OP0_NOP, tgsi_unsupported},
7275 {TGSI_OPCODE_ATOMXOR, 0, ALU_OP0_NOP, tgsi_unsupported},
7276 {TGSI_OPCODE_ATOMUMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
7277 {TGSI_OPCODE_ATOMUMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
7278 {TGSI_OPCODE_ATOMIMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
7279 {TGSI_OPCODE_ATOMIMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
7280 {TGSI_OPCODE_TEX2, 0, FETCH_OP_SAMPLE, tgsi_tex},
7281 {TGSI_OPCODE_TXB2, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
7282 {TGSI_OPCODE_TXL2, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
7283 {TGSI_OPCODE_IMUL_HI, 0, ALU_OP2_MULHI_INT, tgsi_op2_trans},
7284 {TGSI_OPCODE_UMUL_HI, 0, ALU_OP2_MULHI_UINT, tgsi_op2_trans},
7285 {TGSI_OPCODE_TG4, 0, FETCH_OP_GATHER4, tgsi_unsupported},
7286 {TGSI_OPCODE_LODQ, 0, FETCH_OP_GET_LOD, tgsi_unsupported},
7287 {TGSI_OPCODE_IBFE, 1, ALU_OP3_BFE_INT, tgsi_unsupported},
7288 {TGSI_OPCODE_UBFE, 1, ALU_OP3_BFE_UINT, tgsi_unsupported},
7289 {TGSI_OPCODE_BFI, 0, ALU_OP0_NOP, tgsi_unsupported},
7290 {TGSI_OPCODE_BREV, 0, ALU_OP1_BFREV_INT, tgsi_unsupported},
7291 {TGSI_OPCODE_POPC, 0, ALU_OP1_BCNT_INT, tgsi_unsupported},
7292 {TGSI_OPCODE_LSB, 0, ALU_OP1_FFBL_INT, tgsi_unsupported},
7293 {TGSI_OPCODE_IMSB, 0, ALU_OP1_FFBH_INT, tgsi_unsupported},
7294 {TGSI_OPCODE_UMSB, 0, ALU_OP1_FFBH_UINT, tgsi_unsupported},
7295 {TGSI_OPCODE_INTERP_CENTROID, 0, ALU_OP0_NOP, tgsi_unsupported},
7296 {TGSI_OPCODE_INTERP_SAMPLE, 0, ALU_OP0_NOP, tgsi_unsupported},
7297 {TGSI_OPCODE_INTERP_OFFSET, 0, ALU_OP0_NOP, tgsi_unsupported},
7298 {TGSI_OPCODE_LAST, 0, ALU_OP0_NOP, tgsi_unsupported},
7299 };
7300
7301 static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
7302 {TGSI_OPCODE_ARL, 0, ALU_OP0_NOP, tgsi_eg_arl},
7303 {TGSI_OPCODE_MOV, 0, ALU_OP1_MOV, tgsi_op2},
7304 {TGSI_OPCODE_LIT, 0, ALU_OP0_NOP, tgsi_lit},
7305 {TGSI_OPCODE_RCP, 0, ALU_OP1_RECIP_IEEE, tgsi_trans_srcx_replicate},
7306 {TGSI_OPCODE_RSQ, 0, ALU_OP1_RECIPSQRT_IEEE, tgsi_rsq},
7307 {TGSI_OPCODE_EXP, 0, ALU_OP0_NOP, tgsi_exp},
7308 {TGSI_OPCODE_LOG, 0, ALU_OP0_NOP, tgsi_log},
7309 {TGSI_OPCODE_MUL, 0, ALU_OP2_MUL, tgsi_op2},
7310 {TGSI_OPCODE_ADD, 0, ALU_OP2_ADD, tgsi_op2},
7311 {TGSI_OPCODE_DP3, 0, ALU_OP2_DOT4, tgsi_dp},
7312 {TGSI_OPCODE_DP4, 0, ALU_OP2_DOT4, tgsi_dp},
7313 {TGSI_OPCODE_DST, 0, ALU_OP0_NOP, tgsi_opdst},
7314 {TGSI_OPCODE_MIN, 0, ALU_OP2_MIN, tgsi_op2},
7315 {TGSI_OPCODE_MAX, 0, ALU_OP2_MAX, tgsi_op2},
7316 {TGSI_OPCODE_SLT, 0, ALU_OP2_SETGT, tgsi_op2_swap},
7317 {TGSI_OPCODE_SGE, 0, ALU_OP2_SETGE, tgsi_op2},
7318 {TGSI_OPCODE_MAD, 1, ALU_OP3_MULADD, tgsi_op3},
7319 {TGSI_OPCODE_SUB, 0, ALU_OP2_ADD, tgsi_op2},
7320 {TGSI_OPCODE_LRP, 0, ALU_OP0_NOP, tgsi_lrp},
7321 {TGSI_OPCODE_CND, 0, ALU_OP0_NOP, tgsi_unsupported},
7322 {TGSI_OPCODE_SQRT, 0, ALU_OP1_SQRT_IEEE, tgsi_trans_srcx_replicate},
7323 {TGSI_OPCODE_DP2A, 0, ALU_OP0_NOP, tgsi_unsupported},
7324 /* gap */
7325 {22, 0, ALU_OP0_NOP, tgsi_unsupported},
7326 {23, 0, ALU_OP0_NOP, tgsi_unsupported},
7327 {TGSI_OPCODE_FRC, 0, ALU_OP1_FRACT, tgsi_op2},
7328 {TGSI_OPCODE_CLAMP, 0, ALU_OP0_NOP, tgsi_unsupported},
7329 {TGSI_OPCODE_FLR, 0, ALU_OP1_FLOOR, tgsi_op2},
7330 {TGSI_OPCODE_ROUND, 0, ALU_OP1_RNDNE, tgsi_op2},
7331 {TGSI_OPCODE_EX2, 0, ALU_OP1_EXP_IEEE, tgsi_trans_srcx_replicate},
7332 {TGSI_OPCODE_LG2, 0, ALU_OP1_LOG_IEEE, tgsi_trans_srcx_replicate},
7333 {TGSI_OPCODE_POW, 0, ALU_OP0_NOP, tgsi_pow},
7334 {TGSI_OPCODE_XPD, 0, ALU_OP0_NOP, tgsi_xpd},
7335 /* gap */
7336 {32, 0, ALU_OP0_NOP, tgsi_unsupported},
7337 {TGSI_OPCODE_ABS, 0, ALU_OP1_MOV, tgsi_op2},
7338 {TGSI_OPCODE_RCC, 0, ALU_OP0_NOP, tgsi_unsupported},
7339 {TGSI_OPCODE_DPH, 0, ALU_OP2_DOT4, tgsi_dp},
7340 {TGSI_OPCODE_COS, 0, ALU_OP1_COS, tgsi_trig},
7341 {TGSI_OPCODE_DDX, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
7342 {TGSI_OPCODE_DDY, 0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
7343 {TGSI_OPCODE_KILL, 0, ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
7344 {TGSI_OPCODE_PK2H, 0, ALU_OP0_NOP, tgsi_unsupported},
7345 {TGSI_OPCODE_PK2US, 0, ALU_OP0_NOP, tgsi_unsupported},
7346 {TGSI_OPCODE_PK4B, 0, ALU_OP0_NOP, tgsi_unsupported},
7347 {TGSI_OPCODE_PK4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
7348 {TGSI_OPCODE_RFL, 0, ALU_OP0_NOP, tgsi_unsupported},
7349 {TGSI_OPCODE_SEQ, 0, ALU_OP2_SETE, tgsi_op2},
7350 {TGSI_OPCODE_SFL, 0, ALU_OP0_NOP, tgsi_unsupported},
7351 {TGSI_OPCODE_SGT, 0, ALU_OP2_SETGT, tgsi_op2},
7352 {TGSI_OPCODE_SIN, 0, ALU_OP1_SIN, tgsi_trig},
7353 {TGSI_OPCODE_SLE, 0, ALU_OP2_SETGE, tgsi_op2_swap},
7354 {TGSI_OPCODE_SNE, 0, ALU_OP2_SETNE, tgsi_op2},
7355 {TGSI_OPCODE_STR, 0, ALU_OP0_NOP, tgsi_unsupported},
7356 {TGSI_OPCODE_TEX, 0, FETCH_OP_SAMPLE, tgsi_tex},
7357 {TGSI_OPCODE_TXD, 0, FETCH_OP_SAMPLE_G, tgsi_tex},
7358 {TGSI_OPCODE_TXP, 0, FETCH_OP_SAMPLE, tgsi_tex},
7359 {TGSI_OPCODE_UP2H, 0, ALU_OP0_NOP, tgsi_unsupported},
7360 {TGSI_OPCODE_UP2US, 0, ALU_OP0_NOP, tgsi_unsupported},
7361 {TGSI_OPCODE_UP4B, 0, ALU_OP0_NOP, tgsi_unsupported},
7362 {TGSI_OPCODE_UP4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
7363 {TGSI_OPCODE_X2D, 0, ALU_OP0_NOP, tgsi_unsupported},
7364 {TGSI_OPCODE_ARA, 0, ALU_OP0_NOP, tgsi_unsupported},
7365 {TGSI_OPCODE_ARR, 0, ALU_OP0_NOP, tgsi_eg_arl},
7366 {TGSI_OPCODE_BRA, 0, ALU_OP0_NOP, tgsi_unsupported},
7367 {TGSI_OPCODE_CAL, 0, ALU_OP0_NOP, tgsi_unsupported},
7368 {TGSI_OPCODE_RET, 0, ALU_OP0_NOP, tgsi_unsupported},
7369 {TGSI_OPCODE_SSG, 0, ALU_OP0_NOP, tgsi_ssg},
7370 {TGSI_OPCODE_CMP, 0, ALU_OP0_NOP, tgsi_cmp},
7371 {TGSI_OPCODE_SCS, 0, ALU_OP0_NOP, tgsi_scs},
7372 {TGSI_OPCODE_TXB, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
7373 {TGSI_OPCODE_NRM, 0, ALU_OP0_NOP, tgsi_unsupported},
7374 {TGSI_OPCODE_DIV, 0, ALU_OP0_NOP, tgsi_unsupported},
7375 {TGSI_OPCODE_DP2, 0, ALU_OP2_DOT4, tgsi_dp},
7376 {TGSI_OPCODE_TXL, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
7377 {TGSI_OPCODE_BRK, 0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
7378 {TGSI_OPCODE_IF, 0, ALU_OP0_NOP, tgsi_if},
7379 {TGSI_OPCODE_UIF, 0, ALU_OP0_NOP, tgsi_uif},
7380 {76, 0, ALU_OP0_NOP, tgsi_unsupported},
7381 {TGSI_OPCODE_ELSE, 0, ALU_OP0_NOP, tgsi_else},
7382 {TGSI_OPCODE_ENDIF, 0, ALU_OP0_NOP, tgsi_endif},
7383 {TGSI_OPCODE_DDX_FINE, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
7384 {TGSI_OPCODE_DDY_FINE, 0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
7385 {TGSI_OPCODE_PUSHA, 0, ALU_OP0_NOP, tgsi_unsupported},
7386 {TGSI_OPCODE_POPA, 0, ALU_OP0_NOP, tgsi_unsupported},
7387 {TGSI_OPCODE_CEIL, 0, ALU_OP1_CEIL, tgsi_op2},
7388 {TGSI_OPCODE_I2F, 0, ALU_OP1_INT_TO_FLT, tgsi_op2_trans},
7389 {TGSI_OPCODE_NOT, 0, ALU_OP1_NOT_INT, tgsi_op2},
7390 {TGSI_OPCODE_TRUNC, 0, ALU_OP1_TRUNC, tgsi_op2},
7391 {TGSI_OPCODE_SHL, 0, ALU_OP2_LSHL_INT, tgsi_op2},
7392 /* gap */
7393 {88, 0, ALU_OP0_NOP, tgsi_unsupported},
7394 {TGSI_OPCODE_AND, 0, ALU_OP2_AND_INT, tgsi_op2},
7395 {TGSI_OPCODE_OR, 0, ALU_OP2_OR_INT, tgsi_op2},
7396 {TGSI_OPCODE_MOD, 0, ALU_OP0_NOP, tgsi_imod},
7397 {TGSI_OPCODE_XOR, 0, ALU_OP2_XOR_INT, tgsi_op2},
7398 {TGSI_OPCODE_SAD, 0, ALU_OP0_NOP, tgsi_unsupported},
7399 {TGSI_OPCODE_TXF, 0, FETCH_OP_LD, tgsi_tex},
7400 {TGSI_OPCODE_TXQ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
7401 {TGSI_OPCODE_CONT, 0, CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
7402 {TGSI_OPCODE_EMIT, 0, CF_OP_EMIT_VERTEX, tgsi_gs_emit},
7403 {TGSI_OPCODE_ENDPRIM, 0, CF_OP_CUT_VERTEX, tgsi_gs_emit},
7404 {TGSI_OPCODE_BGNLOOP, 0, ALU_OP0_NOP, tgsi_bgnloop},
7405 {TGSI_OPCODE_BGNSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
7406 {TGSI_OPCODE_ENDLOOP, 0, ALU_OP0_NOP, tgsi_endloop},
7407 {TGSI_OPCODE_ENDSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
7408 {TGSI_OPCODE_TXQ_LZ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
7409 /* gap */
7410 {104, 0, ALU_OP0_NOP, tgsi_unsupported},
7411 {105, 0, ALU_OP0_NOP, tgsi_unsupported},
7412 {106, 0, ALU_OP0_NOP, tgsi_unsupported},
7413 {TGSI_OPCODE_NOP, 0, ALU_OP0_NOP, tgsi_unsupported},
7414 {TGSI_OPCODE_FSEQ, 0, ALU_OP2_SETE_DX10, tgsi_op2},
7415 {TGSI_OPCODE_FSGE, 0, ALU_OP2_SETGE_DX10, tgsi_op2},
7416 {TGSI_OPCODE_FSLT, 0, ALU_OP2_SETGT_DX10, tgsi_op2_swap},
7417 {TGSI_OPCODE_FSNE, 0, ALU_OP2_SETNE_DX10, tgsi_op2_swap},
7418 {TGSI_OPCODE_NRM4, 0, ALU_OP0_NOP, tgsi_unsupported},
7419 {TGSI_OPCODE_CALLNZ, 0, ALU_OP0_NOP, tgsi_unsupported},
7420 /* gap */
7421 {114, 0, ALU_OP0_NOP, tgsi_unsupported},
7422 {TGSI_OPCODE_BREAKC, 0, ALU_OP0_NOP, tgsi_unsupported},
7423 {TGSI_OPCODE_KILL_IF, 0, ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
7424 {TGSI_OPCODE_END, 0, ALU_OP0_NOP, tgsi_end}, /* aka HALT */
7425 /* gap */
7426 {118, 0, ALU_OP0_NOP, tgsi_unsupported},
7427 {TGSI_OPCODE_F2I, 0, ALU_OP1_FLT_TO_INT, tgsi_f2i},
7428 {TGSI_OPCODE_IDIV, 0, ALU_OP0_NOP, tgsi_idiv},
7429 {TGSI_OPCODE_IMAX, 0, ALU_OP2_MAX_INT, tgsi_op2},
7430 {TGSI_OPCODE_IMIN, 0, ALU_OP2_MIN_INT, tgsi_op2},
7431 {TGSI_OPCODE_INEG, 0, ALU_OP2_SUB_INT, tgsi_ineg},
7432 {TGSI_OPCODE_ISGE, 0, ALU_OP2_SETGE_INT, tgsi_op2},
7433 {TGSI_OPCODE_ISHR, 0, ALU_OP2_ASHR_INT, tgsi_op2},
7434 {TGSI_OPCODE_ISLT, 0, ALU_OP2_SETGT_INT, tgsi_op2_swap},
7435 {TGSI_OPCODE_F2U, 0, ALU_OP1_FLT_TO_UINT, tgsi_f2i},
7436 {TGSI_OPCODE_U2F, 0, ALU_OP1_UINT_TO_FLT, tgsi_op2_trans},
7437 {TGSI_OPCODE_UADD, 0, ALU_OP2_ADD_INT, tgsi_op2},
7438 {TGSI_OPCODE_UDIV, 0, ALU_OP0_NOP, tgsi_udiv},
7439 {TGSI_OPCODE_UMAD, 0, ALU_OP0_NOP, tgsi_umad},
7440 {TGSI_OPCODE_UMAX, 0, ALU_OP2_MAX_UINT, tgsi_op2},
7441 {TGSI_OPCODE_UMIN, 0, ALU_OP2_MIN_UINT, tgsi_op2},
7442 {TGSI_OPCODE_UMOD, 0, ALU_OP0_NOP, tgsi_umod},
7443 {TGSI_OPCODE_UMUL, 0, ALU_OP2_MULLO_UINT, tgsi_op2_trans},
7444 {TGSI_OPCODE_USEQ, 0, ALU_OP2_SETE_INT, tgsi_op2},
7445 {TGSI_OPCODE_USGE, 0, ALU_OP2_SETGE_UINT, tgsi_op2},
7446 {TGSI_OPCODE_USHR, 0, ALU_OP2_LSHR_INT, tgsi_op2},
7447 {TGSI_OPCODE_USLT, 0, ALU_OP2_SETGT_UINT, tgsi_op2_swap},
7448 {TGSI_OPCODE_USNE, 0, ALU_OP2_SETNE_INT, tgsi_op2},
7449 {TGSI_OPCODE_SWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
7450 {TGSI_OPCODE_CASE, 0, ALU_OP0_NOP, tgsi_unsupported},
7451 {TGSI_OPCODE_DEFAULT, 0, ALU_OP0_NOP, tgsi_unsupported},
7452 {TGSI_OPCODE_ENDSWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
7453 {TGSI_OPCODE_SAMPLE, 0, 0, tgsi_unsupported},
7454 {TGSI_OPCODE_SAMPLE_I, 0, 0, tgsi_unsupported},
7455 {TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
7456 {TGSI_OPCODE_SAMPLE_B, 0, 0, tgsi_unsupported},
7457 {TGSI_OPCODE_SAMPLE_C, 0, 0, tgsi_unsupported},
7458 {TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
7459 {TGSI_OPCODE_SAMPLE_D, 0, 0, tgsi_unsupported},
7460 {TGSI_OPCODE_SAMPLE_L, 0, 0, tgsi_unsupported},
7461 {TGSI_OPCODE_GATHER4, 0, 0, tgsi_unsupported},
7462 {TGSI_OPCODE_SVIEWINFO, 0, 0, tgsi_unsupported},
7463 {TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
7464 {TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
7465 {TGSI_OPCODE_UARL, 0, ALU_OP1_MOVA_INT, tgsi_eg_arl},
7466 {TGSI_OPCODE_UCMP, 0, ALU_OP0_NOP, tgsi_ucmp},
7467 {TGSI_OPCODE_IABS, 0, 0, tgsi_iabs},
7468 {TGSI_OPCODE_ISSG, 0, 0, tgsi_issg},
7469 {TGSI_OPCODE_LOAD, 0, ALU_OP0_NOP, tgsi_unsupported},
7470 {TGSI_OPCODE_STORE, 0, ALU_OP0_NOP, tgsi_unsupported},
7471 {TGSI_OPCODE_MFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
7472 {TGSI_OPCODE_LFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
7473 {TGSI_OPCODE_SFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
7474 {TGSI_OPCODE_BARRIER, 0, ALU_OP0_NOP, tgsi_unsupported},
7475 {TGSI_OPCODE_ATOMUADD, 0, ALU_OP0_NOP, tgsi_unsupported},
7476 {TGSI_OPCODE_ATOMXCHG, 0, ALU_OP0_NOP, tgsi_unsupported},
7477 {TGSI_OPCODE_ATOMCAS, 0, ALU_OP0_NOP, tgsi_unsupported},
7478 {TGSI_OPCODE_ATOMAND, 0, ALU_OP0_NOP, tgsi_unsupported},
7479 {TGSI_OPCODE_ATOMOR, 0, ALU_OP0_NOP, tgsi_unsupported},
7480 {TGSI_OPCODE_ATOMXOR, 0, ALU_OP0_NOP, tgsi_unsupported},
7481 {TGSI_OPCODE_ATOMUMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
7482 {TGSI_OPCODE_ATOMUMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
7483 {TGSI_OPCODE_ATOMIMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
7484 {TGSI_OPCODE_ATOMIMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
7485 {TGSI_OPCODE_TEX2, 0, FETCH_OP_SAMPLE, tgsi_tex},
7486 {TGSI_OPCODE_TXB2, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
7487 {TGSI_OPCODE_TXL2, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
7488 {TGSI_OPCODE_IMUL_HI, 0, ALU_OP2_MULHI_INT, tgsi_op2_trans},
7489 {TGSI_OPCODE_UMUL_HI, 0, ALU_OP2_MULHI_UINT, tgsi_op2_trans},
7490 {TGSI_OPCODE_TG4, 0, FETCH_OP_GATHER4, tgsi_tex},
7491 {TGSI_OPCODE_LODQ, 0, FETCH_OP_GET_LOD, tgsi_tex},
7492 {TGSI_OPCODE_IBFE, 1, ALU_OP3_BFE_INT, tgsi_op3},
7493 {TGSI_OPCODE_UBFE, 1, ALU_OP3_BFE_UINT, tgsi_op3},
7494 {TGSI_OPCODE_BFI, 0, ALU_OP0_NOP, tgsi_bfi},
7495 {TGSI_OPCODE_BREV, 0, ALU_OP1_BFREV_INT, tgsi_op2},
7496 {TGSI_OPCODE_POPC, 0, ALU_OP1_BCNT_INT, tgsi_op2},
7497 {TGSI_OPCODE_LSB, 0, ALU_OP1_FFBL_INT, tgsi_op2},
7498 {TGSI_OPCODE_IMSB, 0, ALU_OP1_FFBH_INT, tgsi_msb},
7499 {TGSI_OPCODE_UMSB, 0, ALU_OP1_FFBH_UINT, tgsi_msb},
7500 {TGSI_OPCODE_INTERP_CENTROID, 0, ALU_OP0_NOP, tgsi_interp_egcm},
7501 {TGSI_OPCODE_INTERP_SAMPLE, 0, ALU_OP0_NOP, tgsi_interp_egcm},
7502 {TGSI_OPCODE_INTERP_OFFSET, 0, ALU_OP0_NOP, tgsi_interp_egcm},
7503 {TGSI_OPCODE_LAST, 0, ALU_OP0_NOP, tgsi_unsupported},
7504 };
7505
/*
 * TGSI opcode table for Cayman ("cm"; see the CAYMAN notes at the top of
 * this file).
 *
 * Every initializer has four members, in this order:
 *   1. the TGSI opcode, or a bare number for a slot with no named opcode;
 *   2. a flag that is 1 only on the entries handled by tgsi_op3 (MAD, IBFE,
 *      UBFE) -- presumably a "three-operand ALU op" marker; confirm against
 *      the definition of struct r600_shader_tgsi_instruction;
 *   3. a hardware opcode constant (ALU_OP*, FETCH_OP* or CF_OP*; a number of
 *      entries carry ALU_OP0_NOP or a literal 0 instead);
 *   4. the handler for the opcode (tgsi_unsupported for opcodes that are
 *      not implemented).
 *
 * The bare-number placeholders (22, 23, 32, 76, 88, 104-106, 114, 118) each
 * sit at the zero-based position equal to their value, which suggests the
 * table is indexed directly by TGSI opcode.
 * NOTE(review): the lookup code is not visible here -- confirm before
 * reordering, inserting or removing entries.
 *
 * Compared with the entries visible in the table directly above, this one
 * differs as follows: SIN uses cayman_trig (tgsi_trig above); I2F and U2F
 * use tgsi_op2 (tgsi_op2_trans above); F2I and F2U use tgsi_op2 (tgsi_f2i
 * above); UMUL, IMUL_HI and UMUL_HI use cayman_mul_int_instr
 * (tgsi_op2_trans above), and UMUL uses ALU_OP2_MULLO_INT rather than
 * ALU_OP2_MULLO_UINT.
 */
7506 static struct r600_shader_tgsi_instruction cm_shader_tgsi_instruction[] = {
7507 {TGSI_OPCODE_ARL, 0, ALU_OP0_NOP, tgsi_eg_arl},
7508 {TGSI_OPCODE_MOV, 0, ALU_OP1_MOV, tgsi_op2},
7509 {TGSI_OPCODE_LIT, 0, ALU_OP0_NOP, tgsi_lit},
7510 {TGSI_OPCODE_RCP, 0, ALU_OP1_RECIP_IEEE, cayman_emit_float_instr},
7511 {TGSI_OPCODE_RSQ, 0, ALU_OP1_RECIPSQRT_IEEE, cayman_emit_float_instr},
7512 {TGSI_OPCODE_EXP, 0, ALU_OP0_NOP, tgsi_exp},
7513 {TGSI_OPCODE_LOG, 0, ALU_OP0_NOP, tgsi_log},
7514 {TGSI_OPCODE_MUL, 0, ALU_OP2_MUL, tgsi_op2},
7515 {TGSI_OPCODE_ADD, 0, ALU_OP2_ADD, tgsi_op2},
7516 {TGSI_OPCODE_DP3, 0, ALU_OP2_DOT4, tgsi_dp},
7517 {TGSI_OPCODE_DP4, 0, ALU_OP2_DOT4, tgsi_dp},
7518 {TGSI_OPCODE_DST, 0, ALU_OP0_NOP, tgsi_opdst},
7519 {TGSI_OPCODE_MIN, 0, ALU_OP2_MIN, tgsi_op2},
7520 {TGSI_OPCODE_MAX, 0, ALU_OP2_MAX, tgsi_op2},
7521 {TGSI_OPCODE_SLT, 0, ALU_OP2_SETGT, tgsi_op2_swap},
7522 {TGSI_OPCODE_SGE, 0, ALU_OP2_SETGE, tgsi_op2},
7523 {TGSI_OPCODE_MAD, 1, ALU_OP3_MULADD, tgsi_op3},
7524 {TGSI_OPCODE_SUB, 0, ALU_OP2_ADD, tgsi_op2}, /* same hardware opcode as the TGSI_OPCODE_ADD entry */
7525 {TGSI_OPCODE_LRP, 0, ALU_OP0_NOP, tgsi_lrp},
7526 {TGSI_OPCODE_CND, 0, ALU_OP0_NOP, tgsi_unsupported},
7527 {TGSI_OPCODE_SQRT, 0, ALU_OP1_SQRT_IEEE, cayman_emit_float_instr},
7528 {TGSI_OPCODE_DP2A, 0, ALU_OP0_NOP, tgsi_unsupported},
7529 /* gap */
7530 {22, 0, ALU_OP0_NOP, tgsi_unsupported},
7531 {23, 0, ALU_OP0_NOP, tgsi_unsupported},
7532 {TGSI_OPCODE_FRC, 0, ALU_OP1_FRACT, tgsi_op2},
7533 {TGSI_OPCODE_CLAMP, 0, ALU_OP0_NOP, tgsi_unsupported},
7534 {TGSI_OPCODE_FLR, 0, ALU_OP1_FLOOR, tgsi_op2},
7535 {TGSI_OPCODE_ROUND, 0, ALU_OP1_RNDNE, tgsi_op2},
7536 {TGSI_OPCODE_EX2, 0, ALU_OP1_EXP_IEEE, cayman_emit_float_instr},
7537 {TGSI_OPCODE_LG2, 0, ALU_OP1_LOG_IEEE, cayman_emit_float_instr},
7538 {TGSI_OPCODE_POW, 0, ALU_OP0_NOP, cayman_pow},
7539 {TGSI_OPCODE_XPD, 0, ALU_OP0_NOP, tgsi_xpd},
7540 /* gap */
7541 {32, 0, ALU_OP0_NOP, tgsi_unsupported},
7542 {TGSI_OPCODE_ABS, 0, ALU_OP1_MOV, tgsi_op2},
7543 {TGSI_OPCODE_RCC, 0, ALU_OP0_NOP, tgsi_unsupported},
7544 {TGSI_OPCODE_DPH, 0, ALU_OP2_DOT4, tgsi_dp},
7545 {TGSI_OPCODE_COS, 0, ALU_OP1_COS, cayman_trig},
7546 {TGSI_OPCODE_DDX, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
7547 {TGSI_OPCODE_DDY, 0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
7548 {TGSI_OPCODE_KILL, 0, ALU_OP2_KILLGT, tgsi_kill}, /* unconditional kill */
7549 {TGSI_OPCODE_PK2H, 0, ALU_OP0_NOP, tgsi_unsupported},
7550 {TGSI_OPCODE_PK2US, 0, ALU_OP0_NOP, tgsi_unsupported},
7551 {TGSI_OPCODE_PK4B, 0, ALU_OP0_NOP, tgsi_unsupported},
7552 {TGSI_OPCODE_PK4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
7553 {TGSI_OPCODE_RFL, 0, ALU_OP0_NOP, tgsi_unsupported},
7554 {TGSI_OPCODE_SEQ, 0, ALU_OP2_SETE, tgsi_op2},
7555 {TGSI_OPCODE_SFL, 0, ALU_OP0_NOP, tgsi_unsupported},
7556 {TGSI_OPCODE_SGT, 0, ALU_OP2_SETGT, tgsi_op2},
7557 {TGSI_OPCODE_SIN, 0, ALU_OP1_SIN, cayman_trig},
7558 {TGSI_OPCODE_SLE, 0, ALU_OP2_SETGE, tgsi_op2_swap},
7559 {TGSI_OPCODE_SNE, 0, ALU_OP2_SETNE, tgsi_op2},
7560 {TGSI_OPCODE_STR, 0, ALU_OP0_NOP, tgsi_unsupported},
7561 {TGSI_OPCODE_TEX, 0, FETCH_OP_SAMPLE, tgsi_tex},
7562 {TGSI_OPCODE_TXD, 0, FETCH_OP_SAMPLE_G, tgsi_tex},
7563 {TGSI_OPCODE_TXP, 0, FETCH_OP_SAMPLE, tgsi_tex},
7564 {TGSI_OPCODE_UP2H, 0, ALU_OP0_NOP, tgsi_unsupported},
7565 {TGSI_OPCODE_UP2US, 0, ALU_OP0_NOP, tgsi_unsupported},
7566 {TGSI_OPCODE_UP4B, 0, ALU_OP0_NOP, tgsi_unsupported},
7567 {TGSI_OPCODE_UP4UB, 0, ALU_OP0_NOP, tgsi_unsupported},
7568 {TGSI_OPCODE_X2D, 0, ALU_OP0_NOP, tgsi_unsupported},
7569 {TGSI_OPCODE_ARA, 0, ALU_OP0_NOP, tgsi_unsupported},
7570 {TGSI_OPCODE_ARR, 0, ALU_OP0_NOP, tgsi_eg_arl},
7571 {TGSI_OPCODE_BRA, 0, ALU_OP0_NOP, tgsi_unsupported},
7572 {TGSI_OPCODE_CAL, 0, ALU_OP0_NOP, tgsi_unsupported},
7573 {TGSI_OPCODE_RET, 0, ALU_OP0_NOP, tgsi_unsupported},
7574 {TGSI_OPCODE_SSG, 0, ALU_OP0_NOP, tgsi_ssg},
7575 {TGSI_OPCODE_CMP, 0, ALU_OP0_NOP, tgsi_cmp},
7576 {TGSI_OPCODE_SCS, 0, ALU_OP0_NOP, tgsi_scs},
7577 {TGSI_OPCODE_TXB, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
7578 {TGSI_OPCODE_NRM, 0, ALU_OP0_NOP, tgsi_unsupported},
7579 {TGSI_OPCODE_DIV, 0, ALU_OP0_NOP, tgsi_unsupported},
7580 {TGSI_OPCODE_DP2, 0, ALU_OP2_DOT4, tgsi_dp},
7581 {TGSI_OPCODE_TXL, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
7582 {TGSI_OPCODE_BRK, 0, CF_OP_LOOP_BREAK, tgsi_loop_brk_cont},
7583 {TGSI_OPCODE_IF, 0, ALU_OP0_NOP, tgsi_if},
7584 {TGSI_OPCODE_UIF, 0, ALU_OP0_NOP, tgsi_uif},
7585 {76, 0, ALU_OP0_NOP, tgsi_unsupported},
7586 {TGSI_OPCODE_ELSE, 0, ALU_OP0_NOP, tgsi_else},
7587 {TGSI_OPCODE_ENDIF, 0, ALU_OP0_NOP, tgsi_endif},
7588 {TGSI_OPCODE_DDX_FINE, 0, FETCH_OP_GET_GRADIENTS_H, tgsi_tex},
7589 {TGSI_OPCODE_DDY_FINE, 0, FETCH_OP_GET_GRADIENTS_V, tgsi_tex},
7590 {TGSI_OPCODE_PUSHA, 0, ALU_OP0_NOP, tgsi_unsupported},
7591 {TGSI_OPCODE_POPA, 0, ALU_OP0_NOP, tgsi_unsupported},
7592 {TGSI_OPCODE_CEIL, 0, ALU_OP1_CEIL, tgsi_op2},
7593 {TGSI_OPCODE_I2F, 0, ALU_OP1_INT_TO_FLT, tgsi_op2},
7594 {TGSI_OPCODE_NOT, 0, ALU_OP1_NOT_INT, tgsi_op2},
7595 {TGSI_OPCODE_TRUNC, 0, ALU_OP1_TRUNC, tgsi_op2},
7596 {TGSI_OPCODE_SHL, 0, ALU_OP2_LSHL_INT, tgsi_op2},
7597 /* gap */
7598 {88, 0, ALU_OP0_NOP, tgsi_unsupported},
7599 {TGSI_OPCODE_AND, 0, ALU_OP2_AND_INT, tgsi_op2},
7600 {TGSI_OPCODE_OR, 0, ALU_OP2_OR_INT, tgsi_op2},
7601 {TGSI_OPCODE_MOD, 0, ALU_OP0_NOP, tgsi_imod},
7602 {TGSI_OPCODE_XOR, 0, ALU_OP2_XOR_INT, tgsi_op2},
7603 {TGSI_OPCODE_SAD, 0, ALU_OP0_NOP, tgsi_unsupported},
7604 {TGSI_OPCODE_TXF, 0, FETCH_OP_LD, tgsi_tex},
7605 {TGSI_OPCODE_TXQ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
7606 {TGSI_OPCODE_CONT, 0, CF_OP_LOOP_CONTINUE, tgsi_loop_brk_cont},
7607 {TGSI_OPCODE_EMIT, 0, CF_OP_EMIT_VERTEX, tgsi_gs_emit},
7608 {TGSI_OPCODE_ENDPRIM, 0, CF_OP_CUT_VERTEX, tgsi_gs_emit},
7609 {TGSI_OPCODE_BGNLOOP, 0, ALU_OP0_NOP, tgsi_bgnloop},
7610 {TGSI_OPCODE_BGNSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
7611 {TGSI_OPCODE_ENDLOOP, 0, ALU_OP0_NOP, tgsi_endloop},
7612 {TGSI_OPCODE_ENDSUB, 0, ALU_OP0_NOP, tgsi_unsupported},
7613 {TGSI_OPCODE_TXQ_LZ, 0, FETCH_OP_GET_TEXTURE_RESINFO, tgsi_tex},
7614 /* gap */
7615 {104, 0, ALU_OP0_NOP, tgsi_unsupported},
7616 {105, 0, ALU_OP0_NOP, tgsi_unsupported},
7617 {106, 0, ALU_OP0_NOP, tgsi_unsupported},
7618 {TGSI_OPCODE_NOP, 0, ALU_OP0_NOP, tgsi_unsupported},
7619 /* not a gap: the next four slots (positions 108-111) hold the named opcodes FSEQ..FSNE */
7620 {TGSI_OPCODE_FSEQ, 0, ALU_OP2_SETE_DX10, tgsi_op2},
7621 {TGSI_OPCODE_FSGE, 0, ALU_OP2_SETGE_DX10, tgsi_op2},
7622 {TGSI_OPCODE_FSLT, 0, ALU_OP2_SETGT_DX10, tgsi_op2_swap},
7623 {TGSI_OPCODE_FSNE, 0, ALU_OP2_SETNE_DX10, tgsi_op2_swap},
7624 {TGSI_OPCODE_NRM4, 0, ALU_OP0_NOP, tgsi_unsupported},
7625 {TGSI_OPCODE_CALLNZ, 0, ALU_OP0_NOP, tgsi_unsupported},
7626 /* gap */
7627 {114, 0, ALU_OP0_NOP, tgsi_unsupported},
7628 {TGSI_OPCODE_BREAKC, 0, ALU_OP0_NOP, tgsi_unsupported},
7629 {TGSI_OPCODE_KILL_IF, 0, ALU_OP2_KILLGT, tgsi_kill}, /* conditional kill */
7630 {TGSI_OPCODE_END, 0, ALU_OP0_NOP, tgsi_end}, /* aka HALT */
7631 /* gap */
7632 {118, 0, ALU_OP0_NOP, tgsi_unsupported},
7633 {TGSI_OPCODE_F2I, 0, ALU_OP1_FLT_TO_INT, tgsi_op2},
7634 {TGSI_OPCODE_IDIV, 0, ALU_OP0_NOP, tgsi_idiv},
7635 {TGSI_OPCODE_IMAX, 0, ALU_OP2_MAX_INT, tgsi_op2},
7636 {TGSI_OPCODE_IMIN, 0, ALU_OP2_MIN_INT, tgsi_op2},
7637 {TGSI_OPCODE_INEG, 0, ALU_OP2_SUB_INT, tgsi_ineg},
7638 {TGSI_OPCODE_ISGE, 0, ALU_OP2_SETGE_INT, tgsi_op2},
7639 {TGSI_OPCODE_ISHR, 0, ALU_OP2_ASHR_INT, tgsi_op2},
7640 {TGSI_OPCODE_ISLT, 0, ALU_OP2_SETGT_INT, tgsi_op2_swap},
7641 {TGSI_OPCODE_F2U, 0, ALU_OP1_FLT_TO_UINT, tgsi_op2},
7642 {TGSI_OPCODE_U2F, 0, ALU_OP1_UINT_TO_FLT, tgsi_op2},
7643 {TGSI_OPCODE_UADD, 0, ALU_OP2_ADD_INT, tgsi_op2},
7644 {TGSI_OPCODE_UDIV, 0, ALU_OP0_NOP, tgsi_udiv},
7645 {TGSI_OPCODE_UMAD, 0, ALU_OP0_NOP, tgsi_umad},
7646 {TGSI_OPCODE_UMAX, 0, ALU_OP2_MAX_UINT, tgsi_op2},
7647 {TGSI_OPCODE_UMIN, 0, ALU_OP2_MIN_UINT, tgsi_op2},
7648 {TGSI_OPCODE_UMOD, 0, ALU_OP0_NOP, tgsi_umod},
7649 {TGSI_OPCODE_UMUL, 0, ALU_OP2_MULLO_INT, cayman_mul_int_instr},
7650 {TGSI_OPCODE_USEQ, 0, ALU_OP2_SETE_INT, tgsi_op2},
7651 {TGSI_OPCODE_USGE, 0, ALU_OP2_SETGE_UINT, tgsi_op2},
7652 {TGSI_OPCODE_USHR, 0, ALU_OP2_LSHR_INT, tgsi_op2},
7653 {TGSI_OPCODE_USLT, 0, ALU_OP2_SETGT_UINT, tgsi_op2_swap},
7654 {TGSI_OPCODE_USNE, 0, ALU_OP2_SETNE_INT, tgsi_op2},
7655 {TGSI_OPCODE_SWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
7656 {TGSI_OPCODE_CASE, 0, ALU_OP0_NOP, tgsi_unsupported},
7657 {TGSI_OPCODE_DEFAULT, 0, ALU_OP0_NOP, tgsi_unsupported},
7658 {TGSI_OPCODE_ENDSWITCH, 0, ALU_OP0_NOP, tgsi_unsupported},
7659 {TGSI_OPCODE_SAMPLE, 0, 0, tgsi_unsupported},
7660 {TGSI_OPCODE_SAMPLE_I, 0, 0, tgsi_unsupported},
7661 {TGSI_OPCODE_SAMPLE_I_MS, 0, 0, tgsi_unsupported},
7662 {TGSI_OPCODE_SAMPLE_B, 0, 0, tgsi_unsupported},
7663 {TGSI_OPCODE_SAMPLE_C, 0, 0, tgsi_unsupported},
7664 {TGSI_OPCODE_SAMPLE_C_LZ, 0, 0, tgsi_unsupported},
7665 {TGSI_OPCODE_SAMPLE_D, 0, 0, tgsi_unsupported},
7666 {TGSI_OPCODE_SAMPLE_L, 0, 0, tgsi_unsupported},
7667 {TGSI_OPCODE_GATHER4, 0, 0, tgsi_unsupported},
7668 {TGSI_OPCODE_SVIEWINFO, 0, 0, tgsi_unsupported},
7669 {TGSI_OPCODE_SAMPLE_POS, 0, 0, tgsi_unsupported},
7670 {TGSI_OPCODE_SAMPLE_INFO, 0, 0, tgsi_unsupported},
7671 {TGSI_OPCODE_UARL, 0, ALU_OP1_MOVA_INT, tgsi_eg_arl},
7672 {TGSI_OPCODE_UCMP, 0, ALU_OP0_NOP, tgsi_ucmp},
7673 {TGSI_OPCODE_IABS, 0, 0, tgsi_iabs},
7674 {TGSI_OPCODE_ISSG, 0, 0, tgsi_issg},
7675 {TGSI_OPCODE_LOAD, 0, ALU_OP0_NOP, tgsi_unsupported},
7676 {TGSI_OPCODE_STORE, 0, ALU_OP0_NOP, tgsi_unsupported},
7677 {TGSI_OPCODE_MFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
7678 {TGSI_OPCODE_LFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
7679 {TGSI_OPCODE_SFENCE, 0, ALU_OP0_NOP, tgsi_unsupported},
7680 {TGSI_OPCODE_BARRIER, 0, ALU_OP0_NOP, tgsi_unsupported},
7681 {TGSI_OPCODE_ATOMUADD, 0, ALU_OP0_NOP, tgsi_unsupported},
7682 {TGSI_OPCODE_ATOMXCHG, 0, ALU_OP0_NOP, tgsi_unsupported},
7683 {TGSI_OPCODE_ATOMCAS, 0, ALU_OP0_NOP, tgsi_unsupported},
7684 {TGSI_OPCODE_ATOMAND, 0, ALU_OP0_NOP, tgsi_unsupported},
7685 {TGSI_OPCODE_ATOMOR, 0, ALU_OP0_NOP, tgsi_unsupported},
7686 {TGSI_OPCODE_ATOMXOR, 0, ALU_OP0_NOP, tgsi_unsupported},
7687 {TGSI_OPCODE_ATOMUMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
7688 {TGSI_OPCODE_ATOMUMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
7689 {TGSI_OPCODE_ATOMIMIN, 0, ALU_OP0_NOP, tgsi_unsupported},
7690 {TGSI_OPCODE_ATOMIMAX, 0, ALU_OP0_NOP, tgsi_unsupported},
7691 {TGSI_OPCODE_TEX2, 0, FETCH_OP_SAMPLE, tgsi_tex},
7692 {TGSI_OPCODE_TXB2, 0, FETCH_OP_SAMPLE_LB, tgsi_tex},
7693 {TGSI_OPCODE_TXL2, 0, FETCH_OP_SAMPLE_L, tgsi_tex},
7694 {TGSI_OPCODE_IMUL_HI, 0, ALU_OP2_MULHI_INT, cayman_mul_int_instr},
7695 {TGSI_OPCODE_UMUL_HI, 0, ALU_OP2_MULHI_UINT, cayman_mul_int_instr},
7696 {TGSI_OPCODE_TG4, 0, FETCH_OP_GATHER4, tgsi_tex},
7697 {TGSI_OPCODE_LODQ, 0, FETCH_OP_GET_LOD, tgsi_tex},
7698 {TGSI_OPCODE_IBFE, 1, ALU_OP3_BFE_INT, tgsi_op3},
7699 {TGSI_OPCODE_UBFE, 1, ALU_OP3_BFE_UINT, tgsi_op3},
7700 {TGSI_OPCODE_BFI, 0, ALU_OP0_NOP, tgsi_bfi},
7701 {TGSI_OPCODE_BREV, 0, ALU_OP1_BFREV_INT, tgsi_op2},
7702 {TGSI_OPCODE_POPC, 0, ALU_OP1_BCNT_INT, tgsi_op2},
7703 {TGSI_OPCODE_LSB, 0, ALU_OP1_FFBL_INT, tgsi_op2},
7704 {TGSI_OPCODE_IMSB, 0, ALU_OP1_FFBH_INT, tgsi_msb},
7705 {TGSI_OPCODE_UMSB, 0, ALU_OP1_FFBH_UINT, tgsi_msb},
7706 {TGSI_OPCODE_INTERP_CENTROID, 0, ALU_OP0_NOP, tgsi_interp_egcm},
7707 {TGSI_OPCODE_INTERP_SAMPLE, 0, ALU_OP0_NOP, tgsi_interp_egcm},
7708 {TGSI_OPCODE_INTERP_OFFSET, 0, ALU_OP0_NOP, tgsi_interp_egcm},
7709 {TGSI_OPCODE_LAST, 0, ALU_OP0_NOP, tgsi_unsupported},
7710 };