freedreno/ir3: drop instr_clone() stuff
mesa.git: src/gallium/drivers/freedreno/ir3/ir3_compiler.c
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include <stdarg.h>

#include "pipe/p_state.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "tgsi/tgsi_lowering.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_ureg.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_strings.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_scan.h"

#include "freedreno_util.h"

#include "ir3_compiler.h"
#include "ir3_shader.h"

#include "instr-a3xx.h"
#include "ir3.h"
struct ir3_compile_context {
	const struct tgsi_token *tokens;
	bool free_tokens;
	struct ir3 *ir;
	struct ir3_shader_variant *so;
	uint16_t integer_s;

	struct ir3_block *block;
	struct ir3_instruction *current_instr;

	/* we need to defer updates to block->outputs[] until the end
	 * of an instruction (so we don't see new value until *after*
	 * the src registers are processed)
	 */
	struct {
		struct ir3_instruction *instr, **instrp;
	} output_updates[16];
	unsigned num_output_updates;

	/* are we in a sequence of "atomic" instructions?
	 */
	bool atomic;

	/* For fragment shaders, from the hw perspective the only
	 * actual input is r0.xy position register passed to bary.f.
	 * But TGSI doesn't know that, it still declares things as
	 * IN[] registers.  So we do all the input tracking normally
	 * and fix things up after compile_instructions()
	 *
	 * NOTE that frag_pos is the hardware position (possibly it
	 * is actually an index or tag or some such.. it is *not*
	 * values that can be directly used for gl_FragCoord..)
	 */
	struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];

	struct tgsi_parse_context parser;
	unsigned type;

	struct tgsi_shader_info info;

	/* for calculating input/output positions/linkages: */
	unsigned next_inloc;

	unsigned num_internal_temps;
	struct tgsi_src_register internal_temps[8];

	/* idx/slot for last compiler generated immediate */
	unsigned immediate_idx;

	/* stack of branch instructions that mark (potentially nested)
	 * branch if/else/loop/etc
	 */
	struct {
		struct ir3_instruction *instr, *cond;
		bool inv;   /* true iff in else leg of branch */
	} branch[16];
	unsigned int branch_count;

	/* list of kill instructions: */
	struct ir3_instruction *kill[16];
	unsigned int kill_count;

	/* used when dst is same as one of the src, to avoid overwriting a
	 * src element before the remaining scalar instructions that make
	 * up the vector operation
	 */
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;

	/* just for catching incorrect use of get_dst()/put_dst():
	 */
	bool using_tmp_dst;
};


static void vectorize(struct ir3_compile_context *ctx,
		struct ir3_instruction *instr, struct tgsi_dst_register *dst,
		int nsrcs, ...);
static void create_mov(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst, struct tgsi_src_register *src);
static type_t get_ftype(struct ir3_compile_context *ctx);

static unsigned
compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
		const struct tgsi_token *tokens)
{
	unsigned ret;
	struct tgsi_shader_info *info = &ctx->info;
	struct tgsi_lowering_config lconfig = {
			.color_two_side = so->key.color_two_side,
			.lower_DST = true,
			.lower_XPD = true,
			.lower_SCS = true,
			.lower_LRP = true,
			.lower_FRC = true,
			.lower_POW = true,
			.lower_LIT = true,
			.lower_EXP = true,
			.lower_LOG = true,
			.lower_DP4 = true,
			.lower_DP3 = true,
			.lower_DPH = true,
			.lower_DP2 = true,
			.lower_DP2A = true,
	};

	switch (so->type) {
	case SHADER_FRAGMENT:
	case SHADER_COMPUTE:
		lconfig.saturate_s = so->key.fsaturate_s;
		lconfig.saturate_t = so->key.fsaturate_t;
		lconfig.saturate_r = so->key.fsaturate_r;
		ctx->integer_s = so->key.finteger_s;
		break;
	case SHADER_VERTEX:
		lconfig.saturate_s = so->key.vsaturate_s;
		lconfig.saturate_t = so->key.vsaturate_t;
		lconfig.saturate_r = so->key.vsaturate_r;
		ctx->integer_s = so->key.vinteger_s;
		break;
	}

	if (!so->shader) {
		/* hack for standalone compiler which does not have
		 * screen/context:
		 */
	} else if (ir3_shader_gpuid(so->shader) >= 400) {
		/* a4xx seems to have *no* sam.p */
		lconfig.lower_TXP = ~0;  /* lower all txp */
	} else {
		/* a3xx just needs to avoid sam.p for 3d tex */
		lconfig.lower_TXP = (1 << TGSI_TEXTURE_3D);
	}

	ctx->tokens = tgsi_transform_lowering(&lconfig, tokens, &ctx->info);
	ctx->free_tokens = !!ctx->tokens;
	if (!ctx->tokens) {
		/* no lowering */
		ctx->tokens = tokens;
	}
	ctx->ir = so->ir;
	ctx->so = so;
	ctx->next_inloc = 8;
	ctx->num_internal_temps = 0;
	ctx->branch_count = 0;
	ctx->kill_count = 0;
	ctx->block = NULL;
	ctx->current_instr = NULL;
	ctx->num_output_updates = 0;
	ctx->atomic = false;
	ctx->frag_pos = NULL;
	ctx->frag_face = NULL;
	ctx->tmp_src = NULL;
	ctx->using_tmp_dst = false;

	memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord));

#define FM(x) (1 << TGSI_FILE_##x)
	/* optimize can't deal with relative addressing: */
	if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
		return TGSI_PARSE_ERROR;

	/* NOTE: if relative addressing is used, we set constlen in
	 * the compiler (to worst-case value) since we don't know in
	 * the assembler what the max addr reg value can be:
	 */
	if (info->indirect_files & FM(CONSTANT))
		so->constlen = 4 * (ctx->info.file_max[TGSI_FILE_CONSTANT] + 1);

	/* Immediates go after constants: */
	so->first_immediate = info->file_max[TGSI_FILE_CONSTANT] + 1;
	ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);

	ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
	if (ret != TGSI_PARSE_OK)
		return ret;

	ctx->type = ctx->parser.FullHeader.Processor.Processor;

	return ret;
}

static void
compile_error(struct ir3_compile_context *ctx, const char *format, ...)
{
	va_list ap;
	va_start(ap, format);
	_debug_vprintf(format, ap);
	va_end(ap);
	tgsi_dump(ctx->tokens, 0);
	debug_assert(0);
}

#define compile_assert(ctx, cond) do { \
		if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
	} while (0)

static void
compile_free(struct ir3_compile_context *ctx)
{
	if (ctx->free_tokens)
		free((void *)ctx->tokens);
	tgsi_parse_free(&ctx->parser);
}

struct instr_translater {
	void (*fxn)(const struct instr_translater *t,
			struct ir3_compile_context *ctx,
			struct tgsi_full_instruction *inst);
	unsigned tgsi_opc;
	opc_t opc;
	opc_t hopc;    /* opc to use for half_precision mode, if different */
	unsigned arg;
};

static void
instr_finish(struct ir3_compile_context *ctx)
{
	unsigned i;

	if (ctx->atomic)
		return;

	for (i = 0; i < ctx->num_output_updates; i++)
		*(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr;

	ctx->num_output_updates = 0;
}

/* For "atomic" groups of instructions, for example the four scalar
 * instructions to perform a vec4 operation.  Basically this just
 * blocks out handling of output_updates so the next scalar instruction
 * still sees the result from before the start of the atomic group.
 *
 * NOTE: when used properly, this could probably replace get/put_dst()
 * stuff.
 */
static void
instr_atomic_start(struct ir3_compile_context *ctx)
{
	ctx->atomic = true;
}

static void
instr_atomic_end(struct ir3_compile_context *ctx)
{
	ctx->atomic = false;
	instr_finish(ctx);
}
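
/* Sketch of how an atomic group is used (see vectorize() below): the
 * scalar instructions making up one vec4 op are emitted between
 * instr_atomic_start()/instr_atomic_end(), so none of them observes the
 * others' dst writes via block->temporaries[]/outputs[]:
 *
 *    instr_atomic_start(ctx);
 *    ... one instr_create() per enabled dst channel ...
 *    instr_atomic_end(ctx);    // flushes the deferred dst updates
 */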

static struct ir3_instruction *
instr_create(struct ir3_compile_context *ctx, int category, opc_t opc)
{
	instr_finish(ctx);
	return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc));
}

static struct ir3_block *
push_block(struct ir3_compile_context *ctx)
{
	struct ir3_block *block;
	unsigned ntmp, nin, nout;

#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1))

	/* hmm, give ourselves room to create 8 extra temporaries (vec4):
	 */
	ntmp = SCALAR_REGS(TEMPORARY);
	ntmp += 8 * 4;

	nout = SCALAR_REGS(OUTPUT);
	nin = SCALAR_REGS(INPUT);

	/* for outermost block, 'inputs' are the actual shader INPUT
	 * register file.  Reads from INPUT registers always go back to
	 * top block.  For nested blocks, 'inputs' is used to track any
	 * TEMPORARY file register from one of the enclosing blocks that
	 * is read in this block.
	 */
	if (!ctx->block) {
		/* NOTE: fragment shaders actually have two inputs (r0.xy, the
		 * position)
		 */
		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			int n = 2;
			if (ctx->info.reads_position)
				n += 4;
			if (ctx->info.uses_frontface)
				n += 4;
			nin = MAX2(n, nin);
			nout += ARRAY_SIZE(ctx->kill);
		}
	} else {
		nin = ntmp;
	}

	block = ir3_block_create(ctx->ir, ntmp, nin, nout);

	if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block)
		block->noutputs -= ARRAY_SIZE(ctx->kill);

	block->parent = ctx->block;
	ctx->block = block;

	return block;
}

static void
pop_block(struct ir3_compile_context *ctx)
{
	ctx->block = ctx->block->parent;
	compile_assert(ctx, ctx->block);
}

static struct ir3_instruction *
create_output(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned n)
{
	struct ir3_instruction *out;

	out = ir3_instr_create(block, -1, OPC_META_OUTPUT);
	out->inout.block = block;
	ir3_reg_create(out, n, 0);
	if (instr)
		ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr;

	return out;
}

static struct ir3_instruction *
create_input(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned n)
{
	struct ir3_instruction *in;

	in = ir3_instr_create(block, -1, OPC_META_INPUT);
	in->inout.block = block;
	ir3_reg_create(in, n, 0);
	if (instr)
		ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;

	return in;
}

static struct ir3_instruction *
block_input(struct ir3_block *block, unsigned n)
{
	/* references to INPUT register file always go back up to
	 * top level:
	 */
	if (block->parent)
		return block_input(block->parent, n);
	return block->inputs[n];
}

/* return temporary in scope, creating a meta-input node
 * if needed to track block inputs
 */
static struct ir3_instruction *
block_temporary(struct ir3_block *block, unsigned n)
{
	/* references to TEMPORARY register file, find the nearest
	 * enclosing block which has already assigned this temporary,
	 * creating meta-input instructions along the way to keep
	 * track of block inputs
	 */
	if (block->parent && !block->temporaries[n]) {
		/* if already have input for this block, reuse: */
		if (!block->inputs[n])
			block->inputs[n] = block_temporary(block->parent, n);

		/* and create new input to return: */
		return create_input(block, block->inputs[n], n);
	}
	return block->temporaries[n];
}
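
/* Example: a TEMP last written in an enclosing block and read inside a
 * nested if/else block resolves here by walking block->parent until the
 * writing block is found, caching the result in block->inputs[n] and
 * returning a meta-input, so the nested block's dependency on the outer
 * value stays explicit. */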

static struct ir3_instruction *
create_immed(struct ir3_compile_context *ctx, float val)
{
	/* NOTE: *don't* use instr_create() here!
	 */
	struct ir3_instruction *instr;
	instr = ir3_instr_create(ctx->block, 1, 0);
	instr->cat1.src_type = get_ftype(ctx);
	instr->cat1.dst_type = get_ftype(ctx);
	ir3_reg_create(instr, 0, 0);
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val;
	return instr;
}

static void
ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_dst_register *dst, unsigned chan)
{
	unsigned n = regid(dst->Index, chan);
	unsigned idx = ctx->num_output_updates;

	compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates));

	/* NOTE: defer update of temporaries[idx] or output[idx]
	 * until instr_finish(), so that if the current instruction
	 * reads the same TEMP/OUT[] it gets the old value:
	 *
	 * bleh.. this might be a bit easier to just figure out
	 * in instr_finish().  But at that point we've already
	 * lost information about OUTPUT vs TEMPORARY register
	 * file..
	 */

	switch (dst->File) {
	case TGSI_FILE_OUTPUT:
		compile_assert(ctx, n < ctx->block->noutputs);
		ctx->output_updates[idx].instrp = &ctx->block->outputs[n];
		ctx->output_updates[idx].instr = instr;
		ctx->num_output_updates++;
		break;
	case TGSI_FILE_TEMPORARY:
		compile_assert(ctx, n < ctx->block->ntemporaries);
		ctx->output_updates[idx].instrp = &ctx->block->temporaries[n];
		ctx->output_updates[idx].instr = instr;
		ctx->num_output_updates++;
		break;
	case TGSI_FILE_ADDRESS:
		compile_assert(ctx, n < 1);
		ctx->output_updates[idx].instrp = &ctx->block->address;
		ctx->output_updates[idx].instr = instr;
		ctx->num_output_updates++;
		break;
	}
}
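
/* In other words, a dst write only lands in block->temporaries[]/
 * outputs[]/address when instr_finish() runs (at the next instr_create(),
 * or at instr_atomic_end() for an atomic group), so srcs resolved in the
 * meantime still see the previous value. */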

static void
ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg,
		const struct tgsi_src_register *src, unsigned chan)
{
	struct ir3_block *block = ctx->block;
	unsigned n = regid(src->Index, chan);

	switch (src->File) {
	case TGSI_FILE_INPUT:
		reg->flags |= IR3_REG_SSA;
		reg->instr = block_input(ctx->block, n);
		break;
	case TGSI_FILE_OUTPUT:
		/* really this should just happen in case of 'MOV_SAT OUT[n], ..',
		 * for the following clamp instructions:
		 */
		reg->flags |= IR3_REG_SSA;
		reg->instr = block->outputs[n];
		/* we don't have to worry about read from an OUTPUT that was
		 * assigned outside of the current block, because the _SAT
		 * clamp instructions will always be in the same block as
		 * the original instruction which wrote the OUTPUT
		 */
		compile_assert(ctx, reg->instr);
		break;
	case TGSI_FILE_TEMPORARY:
		reg->flags |= IR3_REG_SSA;
		reg->instr = block_temporary(ctx->block, n);
		break;
	}

	if ((reg->flags & IR3_REG_SSA) && !reg->instr) {
		/* this can happen when registers (or components of a TGSI
		 * register) are used as src before they have been assigned
		 * (undefined contents).  To avoid confusing the rest of the
		 * compiler, and to generally keep things peachy, substitute
		 * an instruction that sets the src to 0.0.  Or to keep
		 * things undefined, I could plug in a random number? :-P
		 *
		 * NOTE: *don't* use instr_create() here!
		 */
		reg->instr = create_immed(ctx, 0.0);
	}
}

static struct ir3_register *
add_dst_reg_wrmask(struct ir3_compile_context *ctx,
		struct ir3_instruction *instr, const struct tgsi_dst_register *dst,
		unsigned chan, unsigned wrmask)
{
	unsigned flags = 0, num = 0;
	struct ir3_register *reg;

	switch (dst->File) {
	case TGSI_FILE_OUTPUT:
	case TGSI_FILE_TEMPORARY:
		/* uses SSA */
		break;
	case TGSI_FILE_ADDRESS:
		flags |= IR3_REG_ADDR;
		/* uses SSA */
		break;
	default:
		compile_error(ctx, "unsupported dst register file: %s\n",
			tgsi_file_name(dst->File));
		break;
	}

	if (dst->Indirect)
		flags |= IR3_REG_RELATIV;

	reg = ir3_reg_create(instr, regid(num, chan), flags);

	reg->wrmask = wrmask;
	if (wrmask == 0x1) {
		/* normal case */
		ssa_dst(ctx, instr, dst, chan);
	} else if ((dst->File == TGSI_FILE_TEMPORARY) ||
			(dst->File == TGSI_FILE_OUTPUT) ||
			(dst->File == TGSI_FILE_ADDRESS)) {
		unsigned i;

		/* if instruction writes multiple, we need to create
		 * some place-holders to collect the registers:
		 */
		for (i = 0; i < 4; i++) {
			if (wrmask & (1 << i)) {
				struct ir3_instruction *collect =
						ir3_instr_create(ctx->block, -1, OPC_META_FO);
				collect->fo.off = i;
				/* unused dst reg: */
				ir3_reg_create(collect, 0, 0);
				/* and src reg used to hold the original instr */
				ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr;
				if (!ctx->atomic)
					ssa_dst(ctx, collect, dst, chan+i);
			}
		}
	}

	return reg;
}

static struct ir3_register *
add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_dst_register *dst, unsigned chan)
{
	return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1);
}

static struct ir3_register *
add_src_reg_wrmask(struct ir3_compile_context *ctx,
		struct ir3_instruction *instr, const struct tgsi_src_register *src,
		unsigned chan, unsigned wrmask)
{
	unsigned flags = 0, num = 0;
	struct ir3_register *reg;
	struct ir3_instruction *orig = NULL;

	switch (src->File) {
	case TGSI_FILE_IMMEDIATE:
		/* TODO if possible, use actual immediate instead of const.. but
		 * TGSI has vec4 immediates, we can only embed scalar (of limited
		 * size, depending on instruction..)
		 */
		flags |= IR3_REG_CONST;
		num = src->Index + ctx->so->first_immediate;
		break;
	case TGSI_FILE_CONSTANT:
		flags |= IR3_REG_CONST;
		num = src->Index;
		break;
	case TGSI_FILE_OUTPUT:
		/* NOTE: we should only end up w/ OUTPUT file for things like
		 * clamp()'ing saturated dst instructions
		 */
	case TGSI_FILE_INPUT:
	case TGSI_FILE_TEMPORARY:
		/* uses SSA */
		break;
	default:
		compile_error(ctx, "unsupported src register file: %s\n",
			tgsi_file_name(src->File));
		break;
	}

	/* We seem to have 8 bits (6.2) for dst register always, so I think
	 * it is safe to assume GPR cannot be >=64
	 *
	 * cat3 instructions only have 8 bits for src2, but cannot take a
	 * const for src2
	 *
	 * cat5 and cat6 in some cases only has 8 bits, but cannot take a
	 * const for any src.
	 *
	 * Other than that we seem to have 12 bits to encode const src,
	 * except for cat1 which may only have 11 bits (but that seems like
	 * a bug)
	 */
	if (flags & IR3_REG_CONST)
		compile_assert(ctx, src->Index < (1 << 9));
	else
		compile_assert(ctx, src->Index < (1 << 6));

	if (src->Absolute)
		flags |= IR3_REG_ABS;
	if (src->Negate)
		flags |= IR3_REG_NEGATE;

	if (src->Indirect) {
		flags |= IR3_REG_RELATIV;

		/* shouldn't happen, and we can't cope with it below: */
		compile_assert(ctx, wrmask == 0x1);

		/* wrap in a meta-deref to track both the src and address: */
		orig = instr;

		instr = ir3_instr_create(ctx->block, -1, OPC_META_DEREF);
		ir3_reg_create(instr, 0, 0);
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->block->address;
	}

	reg = ir3_reg_create(instr, regid(num, chan), flags);

	reg->wrmask = wrmask;
	if (wrmask == 0x1) {
		/* normal case */
		ssa_src(ctx, reg, src, chan);
	} else if ((src->File == TGSI_FILE_TEMPORARY) ||
			(src->File == TGSI_FILE_OUTPUT) ||
			(src->File == TGSI_FILE_INPUT)) {
		struct ir3_instruction *collect;
		unsigned i;

		compile_assert(ctx, !src->Indirect);

		/* if instruction reads multiple, we need to create
		 * some place-holder to collect the registers:
		 */
		collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
		ir3_reg_create(collect, 0, 0);   /* unused dst reg */

		for (i = 0; i < 4; i++) {
			if (wrmask & (1 << i)) {
				/* and src reg used to point to the original instr */
				ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
						src, chan + i);
			} else if (wrmask & ~((1 << i) - 1)) {
				/* if any remaining components, then dummy
				 * placeholder src reg to fill in the blanks:
				 */
				ir3_reg_create(collect, 0, 0);
			}
		}

		reg->flags |= IR3_REG_SSA;
		reg->instr = collect;
	}

	if (src->Indirect) {
		reg = ir3_reg_create(orig, 0, flags | IR3_REG_SSA);
		reg->instr = instr;
	}
	return reg;
}

static struct ir3_register *
add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_src_register *src, unsigned chan)
{
	return add_src_reg_wrmask(ctx, instr, src, chan, 0x1);
}

static void
src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
{
	src->File = dst->File;
	src->Indirect = dst->Indirect;
	src->Dimension = dst->Dimension;
	src->Index = dst->Index;
	src->Absolute = 0;
	src->Negate = 0;
	src->SwizzleX = TGSI_SWIZZLE_X;
	src->SwizzleY = TGSI_SWIZZLE_Y;
	src->SwizzleZ = TGSI_SWIZZLE_Z;
	src->SwizzleW = TGSI_SWIZZLE_W;
}

/* Get internal-temp src/dst to use for a sequence of instructions
 * generated by a single TGSI op.
 */
static struct tgsi_src_register *
get_internal_temp(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *tmp_dst)
{
	struct tgsi_src_register *tmp_src;
	int n;

	tmp_dst->File = TGSI_FILE_TEMPORARY;
	tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
	tmp_dst->Indirect = 0;
	tmp_dst->Dimension = 0;

	/* assign next temporary: */
	n = ctx->num_internal_temps++;
	compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
	tmp_src = &ctx->internal_temps[n];

	tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;

	src_from_dst(tmp_src, tmp_dst);

	return tmp_src;
}
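
/* Note: internal temps are carved out just past the shader's declared
 * TEMPORARY range (file_max + n + 1); push_block() reserved backing
 * space for them with the "ntmp += 8 * 4" fudge, matching the eight
 * entries of ctx->internal_temps[]. */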

static inline bool
is_const(struct tgsi_src_register *src)
{
	return (src->File == TGSI_FILE_CONSTANT) ||
			(src->File == TGSI_FILE_IMMEDIATE);
}

static inline bool
is_relative(struct tgsi_src_register *src)
{
	return src->Indirect;
}

static inline bool
is_rel_or_const(struct tgsi_src_register *src)
{
	return is_relative(src) || is_const(src);
}

static type_t
get_ftype(struct ir3_compile_context *ctx)
{
	return TYPE_F32;
}

static type_t
get_utype(struct ir3_compile_context *ctx)
{
	return TYPE_U32;
}

static type_t
get_stype(struct ir3_compile_context *ctx)
{
	return TYPE_S32;
}

static unsigned
src_swiz(struct tgsi_src_register *src, int chan)
{
	switch (chan) {
	case 0: return src->SwizzleX;
	case 1: return src->SwizzleY;
	case 2: return src->SwizzleZ;
	case 3: return src->SwizzleW;
	}
	assert(0);
	return 0;
}

/* for instructions that cannot take a const register as src, if needed
 * generate a move to temporary gpr:
 */
static struct tgsi_src_register *
get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
{
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;

	compile_assert(ctx, is_rel_or_const(src));

	tmp_src = get_internal_temp(ctx, &tmp_dst);

	create_mov(ctx, &tmp_dst, src);

	return tmp_src;
}

static void
get_immediate(struct ir3_compile_context *ctx,
		struct tgsi_src_register *reg, uint32_t val)
{
	unsigned neg, swiz, idx, i;
	/* actually maps 1:1 currently.. not sure if that is safe to rely on: */
	static const unsigned swiz2tgsi[] = {
			TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
	};

	for (i = 0; i < ctx->immediate_idx; i++) {
		swiz = i % 4;
		idx = i / 4;

		if (ctx->so->immediates[idx].val[swiz] == val) {
			neg = 0;
			break;
		}

		if (ctx->so->immediates[idx].val[swiz] == -val) {
			neg = 1;
			break;
		}
	}

	if (i == ctx->immediate_idx) {
		/* need to generate a new immediate: */
		swiz = i % 4;
		idx = i / 4;
		neg = 0;
		ctx->so->immediates[idx].val[swiz] = val;
		ctx->so->immediates_count = idx + 1;
		ctx->immediate_idx++;
	}

	reg->File = TGSI_FILE_IMMEDIATE;
	reg->Indirect = 0;
	reg->Dimension = 0;
	reg->Index = idx;
	reg->Absolute = 0;
	reg->Negate = neg;
	reg->SwizzleX = swiz2tgsi[swiz];
	reg->SwizzleY = swiz2tgsi[swiz];
	reg->SwizzleZ = swiz2tgsi[swiz];
	reg->SwizzleW = swiz2tgsi[swiz];
}
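
/* Example: get_immediate(ctx, &reg, fui(1.0)) reuses an existing scalar
 * slot in so->immediates[] if 1.0 (or its negated bit-pattern, in which
 * case reg->Negate is set) was already emitted, otherwise it appends a
 * new slot.  All four swizzle channels of the returned src point at the
 * same scalar. */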

static void
create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst,
		struct tgsi_src_register *src)
{
	type_t type_mov = get_ftype(ctx);
	unsigned i;

	for (i = 0; i < 4; i++) {
		/* move to destination: */
		if (dst->WriteMask & (1 << i)) {
			struct ir3_instruction *instr;

			if (src->Absolute || src->Negate) {
				/* can't have abs or neg on a mov instr, so use
				 * absneg.f instead to handle these cases:
				 */
				instr = instr_create(ctx, 2, OPC_ABSNEG_F);
			} else {
				instr = instr_create(ctx, 1, 0);
				instr->cat1.src_type = type_mov;
				instr->cat1.dst_type = type_mov;
			}

			add_dst_reg(ctx, instr, dst, i);
			add_src_reg(ctx, instr, src, src_swiz(src, i));
		}
	}
}

static void
create_clamp(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst, struct tgsi_src_register *val,
		struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
{
	struct ir3_instruction *instr;

	instr = instr_create(ctx, 2, OPC_MAX_F);
	vectorize(ctx, instr, dst, 2, val, 0, minval, 0);

	instr = instr_create(ctx, 2, OPC_MIN_F);
	vectorize(ctx, instr, dst, 2, val, 0, maxval, 0);
}

static void
create_clamp_imm(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst,
		uint32_t minval, uint32_t maxval)
{
	struct tgsi_src_register minconst, maxconst;
	struct tgsi_src_register src;

	src_from_dst(&src, dst);

	get_immediate(ctx, &minconst, minval);
	get_immediate(ctx, &maxconst, maxval);

	create_clamp(ctx, dst, &src, &minconst, &maxconst);
}

static struct tgsi_dst_register *
get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	unsigned i;

	compile_assert(ctx, !ctx->using_tmp_dst);
	ctx->using_tmp_dst = true;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		struct tgsi_src_register *src = &inst->Src[i].Register;
		if ((src->File == dst->File) && (src->Index == dst->Index)) {
			if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
					(src->SwizzleX == TGSI_SWIZZLE_X) &&
					(src->SwizzleY == TGSI_SWIZZLE_Y) &&
					(src->SwizzleZ == TGSI_SWIZZLE_Z) &&
					(src->SwizzleW == TGSI_SWIZZLE_W))
				continue;
			ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
			ctx->tmp_dst.WriteMask = dst->WriteMask;
			dst = &ctx->tmp_dst;
			break;
		}
	}
	return dst;
}

static void
put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst,
		struct tgsi_dst_register *dst)
{
	compile_assert(ctx, ctx->using_tmp_dst);
	ctx->using_tmp_dst = false;

	/* if necessary, add mov back into original dst: */
	if (dst != &inst->Dst[0].Register) {
		create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src);
	}
}

/* helper to generate the necessary repeat and/or additional instructions
 * to turn a scalar instruction into a vector operation:
 */
static void
vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		struct tgsi_dst_register *dst, int nsrcs, ...)
{
	va_list ap;
	int i, j, n = 0;

	instr_atomic_start(ctx);

	for (i = 0; i < 4; i++) {
		if (dst->WriteMask & (1 << i)) {
			struct ir3_instruction *cur;

			if (n++ == 0) {
				cur = instr;
			} else {
				cur = instr_create(ctx, instr->category, instr->opc);
				memcpy(cur->info, instr->info, sizeof(cur->info));
			}

			add_dst_reg(ctx, cur, dst, i);

			va_start(ap, nsrcs);
			for (j = 0; j < nsrcs; j++) {
				struct tgsi_src_register *src =
						va_arg(ap, struct tgsi_src_register *);
				unsigned flags = va_arg(ap, unsigned);
				struct ir3_register *reg;
				if (flags & IR3_REG_IMMED) {
					reg = ir3_reg_create(cur, 0, IR3_REG_IMMED);
					/* this is an ugly cast.. should have put flags first! */
					reg->iim_val = *(int *)&src;
				} else {
					reg = add_src_reg(ctx, cur, src, src_swiz(src, i));
				}
				reg->flags |= flags & ~IR3_REG_NEGATE;
				if (flags & IR3_REG_NEGATE)
					reg->flags ^= IR3_REG_NEGATE;
			}
			va_end(ap);
		}
	}

	instr_atomic_end(ctx);
}
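
/* Example: a vec4 TGSI op like "ADD dst.xyz, a, b" becomes three scalar
 * add.f instructions via:
 *
 *    instr = instr_create(ctx, 2, OPC_ADD_F);
 *    vectorize(ctx, instr, dst, 2, a, 0, b, 0);
 *
 * The varargs come in (src, flags) pairs; an immediate src is passed by
 * smuggling the integer value through the pointer argument (the "ugly
 * cast" above), e.g. the "a, 0, 0, IR3_REG_IMMED" usage in trans_issg()
 * below. */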

/*
 * Handlers for TGSI instructions which do not have a 1:1 mapping to
 * native instructions:
 */

static void
trans_clamp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src0 = &inst->Src[0].Register;
	struct tgsi_src_register *src1 = &inst->Src[1].Register;
	struct tgsi_src_register *src2 = &inst->Src[2].Register;

	create_clamp(ctx, dst, src0, src1, src2);

	put_dst(ctx, inst, dst);
}

/* ARL(x) = x, but mova from hrN.x to a0.. */
static void
trans_arl(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	unsigned chan = src->SwizzleX;

	compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);

	/* NOTE: we allocate a temporary from a flat register
	 * namespace (ignoring half vs full).  It turns out
	 * not to really matter since registers get reassigned
	 * later in ir3_ra which (hopefully!) can deal a bit
	 * better with mixed half and full precision.
	 */
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	/* cov.{u,f}{32,16}s16 Rtmp, Rsrc */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = (t->tgsi_opc == TGSI_OPCODE_ARL) ?
			get_ftype(ctx) : get_utype(ctx);
	instr->cat1.dst_type = TYPE_S16;
	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, src, chan);

	/* shl.b Rtmp, Rtmp, 2 */
	instr = instr_create(ctx, 2, OPC_SHL_B);
	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;

	/* mova a0, Rtmp */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = TYPE_S16;
	instr->cat1.dst_type = TYPE_S16;
	add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
}
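
/* The shl.b by 2 scales the vec4 index into the scalar register
 * namespace used by regid() (four scalar components per vec4 register),
 * which appears to be the unit that relative-addressing offsets index
 * in. */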

/*
 * texture fetch/sample instructions:
 */

struct tex_info {
	int8_t order[4];
	int8_t args;
	unsigned src_wrmask, flags;
};

struct target_info {
	uint8_t dims;
	uint8_t cube;
	uint8_t array;
	uint8_t shadow;
};

static const struct target_info tex_targets[] = {
	[TGSI_TEXTURE_1D]               = { 1, 0, 0, 0 },
	[TGSI_TEXTURE_2D]               = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_3D]               = { 3, 0, 0, 0 },
	[TGSI_TEXTURE_CUBE]             = { 3, 1, 0, 0 },
	[TGSI_TEXTURE_RECT]             = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_SHADOW1D]         = { 1, 0, 0, 1 },
	[TGSI_TEXTURE_SHADOW2D]         = { 2, 0, 0, 1 },
	[TGSI_TEXTURE_SHADOWRECT]       = { 2, 0, 0, 1 },
	[TGSI_TEXTURE_1D_ARRAY]         = { 1, 0, 1, 0 },
	[TGSI_TEXTURE_2D_ARRAY]         = { 2, 0, 1, 0 },
	[TGSI_TEXTURE_SHADOW1D_ARRAY]   = { 1, 0, 1, 1 },
	[TGSI_TEXTURE_SHADOW2D_ARRAY]   = { 2, 0, 1, 1 },
	[TGSI_TEXTURE_SHADOWCUBE]       = { 3, 1, 0, 1 },
	[TGSI_TEXTURE_2D_MSAA]          = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_2D_ARRAY_MSAA]    = { 2, 0, 1, 0 },
	[TGSI_TEXTURE_CUBE_ARRAY]       = { 3, 1, 1, 0 },
	[TGSI_TEXTURE_SHADOWCUBE_ARRAY] = { 3, 1, 1, 1 },
};

static void
fill_tex_info(struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst,
		struct tex_info *info)
{
	const struct target_info *tgt = &tex_targets[inst->Texture.Texture];

	if (tgt->dims == 3)
		info->flags |= IR3_INSTR_3D;
	if (tgt->array)
		info->flags |= IR3_INSTR_A;
	if (tgt->shadow)
		info->flags |= IR3_INSTR_S;

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_TXB:
	case TGSI_OPCODE_TXB2:
	case TGSI_OPCODE_TXL:
	case TGSI_OPCODE_TXF:
		info->args = 2;
		break;
	case TGSI_OPCODE_TXP:
		info->flags |= IR3_INSTR_P;
		/* fallthrough */
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXD:
		info->args = 1;
		break;
	}

	/*
	 * lay out the first argument in the proper order:
	 *  - actual coordinates first
	 *  - array index
	 *  - shadow reference
	 *  - projection w
	 *
	 * bias/lod go into the second arg
	 */
	int arg, pos = 0;
	for (arg = 0; arg < tgt->dims; arg++)
		info->order[arg] = pos++;
	if (tgt->dims == 1)
		info->order[pos++] = -1;
	if (tgt->shadow)
		info->order[pos++] = MAX2(arg + tgt->array, 2);
	if (tgt->array)
		info->order[pos++] = arg++;
	if (info->flags & IR3_INSTR_P)
		info->order[pos++] = 3;

	info->src_wrmask = (1 << pos) - 1;

	for (; pos < 4; pos++)
		info->order[pos] = -1;

	assert(pos <= 4);
}
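
/* Example: TGSI_TEXTURE_SHADOW2D yields order[] = {0, 1, 2, -1} (x/y
 * coords, then the shadow reference from .z) and src_wrmask = 0x7; for
 * SHADOW1D the pad slot order[1] = -1 is later filled with 0.5 (or 0
 * for TXF) by get_tex_coord(). */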

static bool check_swiz(struct tgsi_src_register *src, const int8_t order[4])
{
	unsigned i;
	for (i = 1; (i < 4) && order[i] >= 0; i++)
		if (src_swiz(src, i) != (src_swiz(src, 0) + order[i]))
			return false;
	return true;
}

static bool is_1d(unsigned tex)
{
	return tex_targets[tex].dims == 1;
}

static struct tgsi_src_register *
get_tex_coord(struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst,
		const struct tex_info *tinf)
{
	struct tgsi_src_register *coord = &inst->Src[0].Register;
	struct ir3_instruction *instr;
	unsigned tex = inst->Texture.Texture;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	type_t type_mov = get_ftype(ctx);
	unsigned j;

	/* need to move things around: */
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	for (j = 0; j < 4; j++) {
		if (tinf->order[j] < 0)
			continue;
		instr = instr_create(ctx, 1, 0);  /* mov */
		instr->cat1.src_type = type_mov;
		instr->cat1.dst_type = type_mov;
		add_dst_reg(ctx, instr, &tmp_dst, j);
		add_src_reg(ctx, instr, coord,
				src_swiz(coord, tinf->order[j]));
	}

	/* fix up .y coord: */
	if (is_1d(tex)) {
		struct ir3_register *imm;
		instr = instr_create(ctx, 1, 0);  /* mov */
		instr->cat1.src_type = type_mov;
		instr->cat1.dst_type = type_mov;
		add_dst_reg(ctx, instr, &tmp_dst, 1);  /* .y */
		imm = ir3_reg_create(instr, 0, IR3_REG_IMMED);
		if (inst->Instruction.Opcode == TGSI_OPCODE_TXF)
			imm->iim_val = 0;
		else
			imm->fim_val = 0.5;
	}

	return tmp_src;
}

static void
trans_samp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr, *collect;
	struct ir3_register *reg;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *orig, *coord, *samp, *offset, *dpdx, *dpdy;
	struct tgsi_src_register zero;
	const struct target_info *tgt = &tex_targets[inst->Texture.Texture];
	struct tex_info tinf;
	int i;

	memset(&tinf, 0, sizeof(tinf));
	fill_tex_info(ctx, inst, &tinf);
	coord = get_tex_coord(ctx, inst, &tinf);
	get_immediate(ctx, &zero, 0);

	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_TXB2:
		orig = &inst->Src[1].Register;
		samp = &inst->Src[2].Register;
		break;
	case TGSI_OPCODE_TXD:
		orig = &inst->Src[0].Register;
		dpdx = &inst->Src[1].Register;
		dpdy = &inst->Src[2].Register;
		samp = &inst->Src[3].Register;
		if (is_rel_or_const(dpdx))
			dpdx = get_unconst(ctx, dpdx);
		if (is_rel_or_const(dpdy))
			dpdy = get_unconst(ctx, dpdy);
		break;
	default:
		orig = &inst->Src[0].Register;
		samp = &inst->Src[1].Register;
		break;
	}
	if (tinf.args > 1 && is_rel_or_const(orig))
		orig = get_unconst(ctx, orig);

	/* scale up integer coords for TXF based on the LOD */
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
		struct tgsi_dst_register tmp_dst;
		struct tgsi_src_register *tmp_src;
		type_t type_mov = get_utype(ctx);

		tmp_src = get_internal_temp(ctx, &tmp_dst);
		for (i = 0; i < tgt->dims; i++) {
			instr = instr_create(ctx, 2, OPC_SHL_B);
			add_dst_reg(ctx, instr, &tmp_dst, i);
			add_src_reg(ctx, instr, coord, src_swiz(coord, i));
			add_src_reg(ctx, instr, orig, orig->SwizzleW);
		}
		if (tgt->dims < 2) {
			instr = instr_create(ctx, 1, 0);
			instr->cat1.src_type = type_mov;
			instr->cat1.dst_type = type_mov;
			add_dst_reg(ctx, instr, &tmp_dst, i);
			add_src_reg(ctx, instr, &zero, 0);
			i++;
		}
		if (tgt->array) {
			instr = instr_create(ctx, 1, 0);
			instr->cat1.src_type = type_mov;
			instr->cat1.dst_type = type_mov;
			add_dst_reg(ctx, instr, &tmp_dst, i);
			add_src_reg(ctx, instr, coord, src_swiz(coord, i));
		}
		coord = tmp_src;
	}

	if (inst->Texture.NumOffsets) {
		struct tgsi_texture_offset *tex_offset = &inst->TexOffsets[0];
		struct tgsi_src_register offset_src = {0};

		offset_src.File = tex_offset->File;
		offset_src.Index = tex_offset->Index;
		offset_src.SwizzleX = tex_offset->SwizzleX;
		offset_src.SwizzleY = tex_offset->SwizzleY;
		offset_src.SwizzleZ = tex_offset->SwizzleZ;
		offset = get_unconst(ctx, &offset_src);
		tinf.flags |= IR3_INSTR_O;
	}

	instr = instr_create(ctx, 5, t->opc);
	if (ctx->integer_s & (1 << samp->Index))
		instr->cat5.type = get_utype(ctx);
	else
		instr->cat5.type = get_ftype(ctx);
	instr->cat5.samp = samp->Index;
	instr->cat5.tex = samp->Index;
	instr->flags |= tinf.flags;

	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);

	reg = ir3_reg_create(instr, 0, IR3_REG_SSA);

	collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
	ir3_reg_create(collect, 0, 0);
	for (i = 0; i < 4; i++) {
		if (tinf.src_wrmask & (1 << i))
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
					coord, src_swiz(coord, i));
		else if (tinf.src_wrmask & ~((1 << i) - 1))
			ir3_reg_create(collect, 0, 0);
	}

	/* Attach derivatives onto the end of the fan-in.  Derivatives start after
	 * the 4th argument, so make sure that fi is padded up to 4 first.
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
		while (collect->regs_count < 5)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
		for (i = 0; i < tgt->dims; i++)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdx, i);
		if (tgt->dims < 2)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
		for (i = 0; i < tgt->dims; i++)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdy, i);
		if (tgt->dims < 2)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
		tinf.src_wrmask |= ((1 << (2 * MAX2(tgt->dims, 2))) - 1) << 4;
	}

	reg->instr = collect;
	reg->wrmask = tinf.src_wrmask;

	/* The second argument contains the offsets, followed by the lod/bias
	 * argument.  This is constructed more manually due to the dynamic nature.
	 */
	if (inst->Texture.NumOffsets == 0 && tinf.args == 1)
		return;

	reg = ir3_reg_create(instr, 0, IR3_REG_SSA);

	collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
	ir3_reg_create(collect, 0, 0);

	if (inst->Texture.NumOffsets) {
		for (i = 0; i < tgt->dims; i++)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
					offset, i);
		if (tgt->dims < 2)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
	}
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2)
		ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
				orig, orig->SwizzleX);
	else if (tinf.args > 1)
		ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
				orig, orig->SwizzleW);

	reg->instr = collect;
	reg->wrmask = (1 << (collect->regs_count - 1)) - 1;
}

static void
trans_txq(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *level = &inst->Src[0].Register;
	struct tgsi_src_register *samp = &inst->Src[1].Register;
	struct tex_info tinf;

	memset(&tinf, 0, sizeof(tinf));
	fill_tex_info(ctx, inst, &tinf);
	if (is_rel_or_const(level))
		level = get_unconst(ctx, level);

	instr = instr_create(ctx, 5, OPC_GETSIZE);
	instr->cat5.type = get_utype(ctx);
	instr->cat5.samp = samp->Index;
	instr->cat5.tex = samp->Index;
	instr->flags |= tinf.flags;

	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
	add_src_reg_wrmask(ctx, instr, level, level->SwizzleX, 0x1);
}

/* DDX/DDY */
static void
trans_deriv(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	static const int8_t order[4] = {0, 1, 2, 3};

	if (!check_swiz(src, order)) {
		struct tgsi_dst_register tmp_dst;
		struct tgsi_src_register *tmp_src;

		tmp_src = get_internal_temp(ctx, &tmp_dst);
		create_mov(ctx, &tmp_dst, src);

		src = tmp_src;
	}

	/* This might be a workaround for hw bug?  Blob compiler always
	 * seems to work two components at a time for dsy/dsx.  It does
	 * actually seem to work in some cases (or at least some piglit
	 * tests) for four components at a time.  But seems more reliable
	 * to split this into two instructions like the blob compiler
	 * does:
	 */

	instr = instr_create(ctx, 5, t->opc);
	instr->cat5.type = get_ftype(ctx);
	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask & 0x3);
	add_src_reg_wrmask(ctx, instr, src, 0, dst->WriteMask & 0x3);

	instr = instr_create(ctx, 5, t->opc);
	instr->cat5.type = get_ftype(ctx);
	add_dst_reg_wrmask(ctx, instr, dst, 2, (dst->WriteMask >> 2) & 0x3);
	add_src_reg_wrmask(ctx, instr, src, 2, (dst->WriteMask >> 2) & 0x3);
}

/*
 * SEQ(a,b) = (a == b) ? 1.0 : 0.0
 *   cmps.f.eq tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SNE(a,b) = (a != b) ? 1.0 : 0.0
 *   cmps.f.ne tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SGE(a,b) = (a >= b) ? 1.0 : 0.0
 *   cmps.f.ge tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SLE(a,b) = (a <= b) ? 1.0 : 0.0
 *   cmps.f.le tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SGT(a,b) = (a > b) ? 1.0 : 0.0
 *   cmps.f.gt tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SLT(a,b) = (a < b) ? 1.0 : 0.0
 *   cmps.f.lt tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * CMP(a,b,c) = (a < 0.0) ? b : c
 *   cmps.f.lt tmp0, a, {0.0}
 *   sel.b16 dst, b, tmp0, c
 */
static void
trans_cmp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register constval0;
	/* final instruction for CMP() uses orig src1 and src2: */
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *a0, *a1, *a2;
	unsigned condition;

	tmp_src = get_internal_temp(ctx, &tmp_dst);

	a0 = &inst->Src[0].Register;  /* a */
	a1 = &inst->Src[1].Register;  /* b */

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_FSEQ:
		condition = IR3_COND_EQ;
		break;
	case TGSI_OPCODE_SNE:
	case TGSI_OPCODE_FSNE:
		condition = IR3_COND_NE;
		break;
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_FSGE:
		condition = IR3_COND_GE;
		break;
	case TGSI_OPCODE_SLT:
	case TGSI_OPCODE_FSLT:
		condition = IR3_COND_LT;
		break;
	case TGSI_OPCODE_SLE:
		condition = IR3_COND_LE;
		break;
	case TGSI_OPCODE_SGT:
		condition = IR3_COND_GT;
		break;
	case TGSI_OPCODE_CMP:
		get_immediate(ctx, &constval0, fui(0.0));
		a0 = &inst->Src[0].Register;  /* a */
		a1 = &constval0;              /* {0.0} */
		condition = IR3_COND_LT;
		break;
	default:
		compile_assert(ctx, 0);
		return;
	}

	if (is_const(a0) && is_const(a1))
		a0 = get_unconst(ctx, a0);

	/* cmps.f.<cond> tmp, a0, a1 */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	instr->cat2.condition = condition;
	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_SLE:
	case TGSI_OPCODE_SNE:
	case TGSI_OPCODE_SGT:
	case TGSI_OPCODE_SLT:
		/* cov.u16f16 dst, tmp0 */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = get_utype(ctx);
		instr->cat1.dst_type = get_ftype(ctx);
		vectorize(ctx, instr, dst, 1, tmp_src, 0);
		break;
	case TGSI_OPCODE_FSEQ:
	case TGSI_OPCODE_FSGE:
	case TGSI_OPCODE_FSNE:
	case TGSI_OPCODE_FSLT:
		/* absneg.s dst, (neg)tmp0 */
		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
		vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_NEGATE);
		break;
	case TGSI_OPCODE_CMP:
		a1 = &inst->Src[1].Register;
		a2 = &inst->Src[2].Register;
		/* sel.{b32,b16} dst, src2, tmp, src1 */
		instr = instr_create(ctx, 3, OPC_SEL_B32);
		vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);

		break;
	}

	put_dst(ctx, inst, dst);
}

/*
 * USNE(a,b) = (a != b) ? ~0 : 0
 *   cmps.u32.ne dst, a, b
 *
 * USEQ(a,b) = (a == b) ? ~0 : 0
 *   cmps.u32.eq dst, a, b
 *
 * ISGE(a,b) = (a >= b) ? ~0 : 0
 *   cmps.s32.ge dst, a, b
 *
 * USGE(a,b) = (a >= b) ? ~0 : 0
 *   cmps.u32.ge dst, a, b
 *
 * ISLT(a,b) = (a < b) ? ~0 : 0
 *   cmps.s32.lt dst, a, b
 *
 * USLT(a,b) = (a < b) ? ~0 : 0
 *   cmps.u32.lt dst, a, b
 *
 */
1609 static void
1610 trans_icmp(const struct instr_translater *t,
1611 struct ir3_compile_context *ctx,
1612 struct tgsi_full_instruction *inst)
1613 {
1614 struct ir3_instruction *instr;
1615 struct tgsi_dst_register *dst = get_dst(ctx, inst);
1616 struct tgsi_dst_register tmp_dst;
1617 struct tgsi_src_register *tmp_src;
1618 struct tgsi_src_register *a0, *a1;
1619 unsigned condition;
1620
1621 a0 = &inst->Src[0].Register; /* a */
1622 a1 = &inst->Src[1].Register; /* b */
1623
1624 switch (t->tgsi_opc) {
1625 case TGSI_OPCODE_USNE:
1626 condition = IR3_COND_NE;
1627 break;
1628 case TGSI_OPCODE_USEQ:
1629 condition = IR3_COND_EQ;
1630 break;
1631 case TGSI_OPCODE_ISGE:
1632 case TGSI_OPCODE_USGE:
1633 condition = IR3_COND_GE;
1634 break;
1635 case TGSI_OPCODE_ISLT:
1636 case TGSI_OPCODE_USLT:
1637 condition = IR3_COND_LT;
1638 break;
1639
1640 default:
1641 compile_assert(ctx, 0);
1642 return;
1643 }
1644
1645 if (is_const(a0) && is_const(a1))
1646 a0 = get_unconst(ctx, a0);
1647
1648 tmp_src = get_internal_temp(ctx, &tmp_dst);
1649 /* cmps.{u32,s32}.<cond> tmp, a0, a1 */
1650 instr = instr_create(ctx, 2, t->opc);
1651 instr->cat2.condition = condition;
1652 vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
1653
1654 /* absneg.s dst, (neg)tmp */
1655 instr = instr_create(ctx, 2, OPC_ABSNEG_S);
1656 vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_NEGATE);
1657
1658 put_dst(ctx, inst, dst);
1659 }
1660
1661 /*
1662 * UCMP(a,b,c) = a ? b : c
1663 * sel.b16 dst, b, a, c
1664 */
1665 static void
1666 trans_ucmp(const struct instr_translater *t,
1667 struct ir3_compile_context *ctx,
1668 struct tgsi_full_instruction *inst)
1669 {
1670 struct ir3_instruction *instr;
1671 struct tgsi_dst_register *dst = get_dst(ctx, inst);
1672 struct tgsi_src_register *a0, *a1, *a2;
1673
1674 a0 = &inst->Src[0].Register; /* a */
1675 a1 = &inst->Src[1].Register; /* b */
1676 a2 = &inst->Src[2].Register; /* c */
1677
1678 if (is_rel_or_const(a0))
1679 a0 = get_unconst(ctx, a0);
1680
1681 /* sel.{b32,b16} dst, b, a, c */
1682 instr = instr_create(ctx, 3, OPC_SEL_B32);
1683 vectorize(ctx, instr, dst, 3, a1, 0, a0, 0, a2, 0);
1684 put_dst(ctx, inst, dst);
1685 }
1686
1687 /*
1688 * ISSG(a) = a < 0 ? -1 : a > 0 ? 1 : 0
1689 * cmps.s.lt tmp_neg, a, 0 # 1 if a is negative
1690 * cmps.s.gt tmp_pos, a, 0 # 1 if a is positive
1691 * sub.u dst, tmp_pos, tmp_neg
1692 */
1693 static void
1694 trans_issg(const struct instr_translater *t,
1695 struct ir3_compile_context *ctx,
1696 struct tgsi_full_instruction *inst)
1697 {
1698 struct ir3_instruction *instr;
1699 struct tgsi_dst_register *dst = get_dst(ctx, inst);
1700 struct tgsi_src_register *a = &inst->Src[0].Register;
1701 struct tgsi_dst_register neg_dst, pos_dst;
1702 struct tgsi_src_register *neg_src, *pos_src;
1703
1704 neg_src = get_internal_temp(ctx, &neg_dst);
1705 pos_src = get_internal_temp(ctx, &pos_dst);
1706
1707 /* cmps.s.lt neg, a, 0 */
1708 instr = instr_create(ctx, 2, OPC_CMPS_S);
1709 instr->cat2.condition = IR3_COND_LT;
1710 vectorize(ctx, instr, &neg_dst, 2, a, 0, 0, IR3_REG_IMMED);
1711
1712 /* cmps.s.gt pos, a, 0 */
1713 instr = instr_create(ctx, 2, OPC_CMPS_S);
1714 instr->cat2.condition = IR3_COND_GT;
1715 vectorize(ctx, instr, &pos_dst, 2, a, 0, 0, IR3_REG_IMMED);
1716
1717 /* sub.u dst, pos, neg */
1718 instr = instr_create(ctx, 2, OPC_SUB_U);
1719 vectorize(ctx, instr, dst, 2, pos_src, 0, neg_src, 0);
1720
1721 put_dst(ctx, inst, dst);
1722 }
1723
1724
1725
1726 /*
1727 * Conditional / Flow control
1728 */
1729
1730 static void
1731 push_branch(struct ir3_compile_context *ctx, bool inv,
1732 struct ir3_instruction *instr, struct ir3_instruction *cond)
1733 {
1734 unsigned int idx = ctx->branch_count++;
1735 compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch));
1736 ctx->branch[idx].instr = instr;
1737 ctx->branch[idx].inv = inv;
1738 /* else side of branch has same condition: */
1739 if (!inv)
1740 ctx->branch[idx].cond = cond;
1741 }
1742
1743 static struct ir3_instruction *
1744 pop_branch(struct ir3_compile_context *ctx)
1745 {
1746 unsigned int idx = --ctx->branch_count;
1747 return ctx->branch[idx].instr;
1748 }
1749
1750 static void
1751 trans_if(const struct instr_translater *t,
1752 struct ir3_compile_context *ctx,
1753 struct tgsi_full_instruction *inst)
1754 {
1755 struct ir3_instruction *instr, *cond;
1756 struct tgsi_src_register *src = &inst->Src[0].Register;
1757 struct tgsi_dst_register tmp_dst;
1758 struct tgsi_src_register *tmp_src;
1759 struct tgsi_src_register constval;
1760
1761 get_immediate(ctx, &constval, fui(0.0));
1762 tmp_src = get_internal_temp(ctx, &tmp_dst);
1763
1764 if (is_const(src))
1765 src = get_unconst(ctx, src);
1766
1767 /* cmps.{f,u}.ne tmp0, b, {0.0} */
1768 instr = instr_create(ctx, 2, t->opc);
1769 add_dst_reg(ctx, instr, &tmp_dst, 0);
1770 add_src_reg(ctx, instr, src, src->SwizzleX);
1771 add_src_reg(ctx, instr, &constval, constval.SwizzleX);
1772 instr->cat2.condition = IR3_COND_NE;
1773
1774 compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */
1775 cond = instr->regs[1]->instr;
1776
1777 /* meta:flow tmp0 */
1778 instr = instr_create(ctx, -1, OPC_META_FLOW);
1779 ir3_reg_create(instr, 0, 0); /* dummy dst */
1780 add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);
1781
1782 push_branch(ctx, false, instr, cond);
1783 instr->flow.if_block = push_block(ctx);
1784 }
1785
1786 static void
1787 trans_else(const struct instr_translater *t,
1788 struct ir3_compile_context *ctx,
1789 struct tgsi_full_instruction *inst)
1790 {
1791 struct ir3_instruction *instr;
1792
1793 pop_block(ctx);
1794
1795 instr = pop_branch(ctx);
1796
1797 compile_assert(ctx, (instr->category == -1) &&
1798 (instr->opc == OPC_META_FLOW));
1799
1800 push_branch(ctx, true, instr, NULL);
1801 instr->flow.else_block = push_block(ctx);
1802 }
1803
1804 static struct ir3_instruction *
1805 find_temporary(struct ir3_block *block, unsigned n)
1806 {
1807 if (block->parent && !block->temporaries[n])
1808 return find_temporary(block->parent, n);
1809 return block->temporaries[n];
1810 }
1811
1812 static struct ir3_instruction *
1813 find_output(struct ir3_block *block, unsigned n)
1814 {
1815 if (block->parent && !block->outputs[n])
1816 return find_output(block->parent, n);
1817 return block->outputs[n];
1818 }
1819
1820 static struct ir3_instruction *
1821 create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond,
1822 struct ir3_instruction *a, struct ir3_instruction *b)
1823 {
1824 struct ir3_instruction *phi;
1825
1826 compile_assert(ctx, cond);
1827
1828 /* Either side of the condition could be null.. which
1829 * indicates a variable written on only one side of the
1830 * branch. Normally this should only be variables not
1831 * used outside of that side of the branch. So we could
1832 * just 'return a ? a : b;' in that case. But for better
1833 * defined undefined behavior we just stick in imm{0.0}.
1834 * In the common case of a value only used within the
1835 * one side of the branch, the PHI instruction will not
1836 * get scheduled
1837 */
1838 if (!a)
1839 a = create_immed(ctx, 0.0);
1840 if (!b)
1841 b = create_immed(ctx, 0.0);
1842
1843 phi = instr_create(ctx, -1, OPC_META_PHI);
1844 ir3_reg_create(phi, 0, 0); /* dummy dst */
1845 ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond;
1846 ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a;
1847 ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b;
1848
1849 return phi;
1850 }
1851
1852 static void
1853 trans_endif(const struct instr_translater *t,
1854 struct ir3_compile_context *ctx,
1855 struct tgsi_full_instruction *inst)
1856 {
1857 struct ir3_instruction *instr;
1858 struct ir3_block *ifb, *elseb;
1859 struct ir3_instruction **ifout, **elseout;
1860 unsigned i, ifnout = 0, elsenout = 0;
1861
1862 pop_block(ctx);
1863
1864 instr = pop_branch(ctx);
1865
1866 compile_assert(ctx, (instr->category == -1) &&
1867 (instr->opc == OPC_META_FLOW));
1868
1869 ifb = instr->flow.if_block;
1870 elseb = instr->flow.else_block;
1871 /* if there is no else block, the parent block is used for the
1872 * branch-not-taken src of the PHI instructions:
1873 */
1874 if (!elseb)
1875 elseb = ifb->parent;
1876
1877 /* worst case sizes: */
1878 ifnout = ifb->ntemporaries + ifb->noutputs;
1879 elsenout = elseb->ntemporaries + elseb->noutputs;
1880
1881 ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout);
1882 if (elseb != ifb->parent)
1883 elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout);
1884
1885 ifnout = 0;
1886 elsenout = 0;
1887
1888 /* generate PHI instructions for any temporaries written: */
1889 for (i = 0; i < ifb->ntemporaries; i++) {
1890 struct ir3_instruction *a = ifb->temporaries[i];
1891 struct ir3_instruction *b = elseb->temporaries[i];
1892
1893 /* if temporary written in if-block, or if else block
1894 * is present and temporary written in else-block:
1895 */
1896 if (a || ((elseb != ifb->parent) && b)) {
1897 struct ir3_instruction *phi;
1898
1899 /* if only written on one side, find the closest
1900 * enclosing update on other side:
1901 */
1902 if (!a)
1903 a = find_temporary(ifb, i);
1904 if (!b)
1905 b = find_temporary(elseb, i);
1906
1907 ifout[ifnout] = a;
1908 a = create_output(ifb, a, ifnout++);
1909
1910 if (elseb != ifb->parent) {
1911 elseout[elsenout] = b;
1912 b = create_output(elseb, b, elsenout++);
1913 }
1914
1915 phi = create_phi(ctx, instr, a, b);
1916 ctx->block->temporaries[i] = phi;
1917 }
1918 }
1919
1920 compile_assert(ctx, ifb->noutputs == elseb->noutputs);
1921
1922 /* .. and any outputs written: */
1923 for (i = 0; i < ifb->noutputs; i++) {
1924 struct ir3_instruction *a = ifb->outputs[i];
1925 struct ir3_instruction *b = elseb->outputs[i];
1926
1927 /* if output written in if-block, or if else block
1928 * is present and output written in else-block:
1929 */
1930 if (a || ((elseb != ifb->parent) && b)) {
1931 struct ir3_instruction *phi;
1932
1933 /* if only written on one side, find the closest
1934 * enclosing update on other side:
1935 */
1936 if (!a)
1937 a = find_output(ifb, i);
1938 if (!b)
1939 b = find_output(elseb, i);
1940
1941 ifout[ifnout] = a;
1942 a = create_output(ifb, a, ifnout++);
1943
1944 if (elseb != ifb->parent) {
1945 elseout[elsenout] = b;
1946 b = create_output(elseb, b, elsenout++);
1947 }
1948
1949 phi = create_phi(ctx, instr, a, b);
1950 ctx->block->outputs[i] = phi;
1951 }
1952 }
1953
1954 ifb->noutputs = ifnout;
1955 ifb->outputs = ifout;
1956
1957 if (elseb != ifb->parent) {
1958 elseb->noutputs = elsenout;
1959 elseb->outputs = elseout;
1960 }
1961
1962 // TODO maybe we want to compact block->inputs?
1963 }
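
/* Editor's illustration (not from the original source): for TGSI along
 * the lines of  if (cond) { x = a; } else { x = b; }  the translaters
 * above emit roughly:
 *
 *    cmps.f.ne tmp, cond, {0.0}     ; trans_if
 *    flow = meta:flow tmp           ; trans_if, pushed on branch stack
 *      <if-block>   .. writes a     ; hoisted to an if-block output
 *      <else-block> .. writes b     ; hoisted to an else-block output
 *    x = meta:phi flow, a, b        ; created here in trans_endif
 *
 * leaving it to later passes (e.g. flattening) to lower the phi to a
 * select or predicated moves.
 */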
1964
1965 /*
1966 * Kill
1967 */
1968
1969 static void
1970 trans_kill(const struct instr_translater *t,
1971 struct ir3_compile_context *ctx,
1972 struct tgsi_full_instruction *inst)
1973 {
1974 struct ir3_instruction *instr, *immed, *cond = NULL;
1975 bool inv = false;
1976
1977 /* unconditional kill, use enclosing if condition: */
1978 if (ctx->branch_count > 0) {
1979 unsigned int idx = ctx->branch_count - 1;
1980 cond = ctx->branch[idx].cond;
1981 inv = ctx->branch[idx].inv;
1982 } else {
1983 cond = create_immed(ctx, 1.0);
1984 }
1985
1986 compile_assert(ctx, cond);
1987
1988 immed = create_immed(ctx, 0.0);
1989
1990 /* cmps.f.ne p0.x, cond, {0.0} */
1991 instr = instr_create(ctx, 2, OPC_CMPS_F);
1992 instr->cat2.condition = IR3_COND_NE;
1993 ir3_reg_create(instr, regid(REG_P0, 0), 0);
1994 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
1995 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
1996 cond = instr;
1997
1998 /* kill p0.x */
1999 instr = instr_create(ctx, 0, OPC_KILL);
2000 instr->cat0.inv = inv;
2001 ir3_reg_create(instr, 0, 0); /* dummy dst */
2002 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
2003
2004 ctx->kill[ctx->kill_count++] = instr;
2005
2006 ctx->so->has_kill = true;
2007 }
2008
2009 /*
2010 * Kill-If
2011 */
2012
2013 static void
2014 trans_killif(const struct instr_translater *t,
2015 struct ir3_compile_context *ctx,
2016 struct tgsi_full_instruction *inst)
2017 {
2018 struct tgsi_src_register *src = &inst->Src[0].Register;
2019 struct ir3_instruction *instr, *immed, *cond = NULL;
2020 bool inv = false;
2021
2022 immed = create_immed(ctx, 0.0);
2023
2024 /* cmps.f.ne p0.x, cond, {0.0} */
2025 instr = instr_create(ctx, 2, OPC_CMPS_F);
2026 instr->cat2.condition = IR3_COND_NE;
2027 ir3_reg_create(instr, regid(REG_P0, 0), 0);
2028 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
2029 add_src_reg(ctx, instr, src, src->SwizzleX);
2030
2031 cond = instr;
2032
2033 /* kill p0.x */
2034 instr = instr_create(ctx, 0, OPC_KILL);
2035 instr->cat0.inv = inv;
2036 ir3_reg_create(instr, 0, 0); /* dummy dst */
2037 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
2038
2039 ctx->kill[ctx->kill_count++] = instr;
2040
2041 ctx->so->has_kill = true;
2042 }
2043
2044 /*
2045 * I2F / U2F / F2I / F2U
2046 */
2047
2048 static void
2049 trans_cov(const struct instr_translater *t,
2050 struct ir3_compile_context *ctx,
2051 struct tgsi_full_instruction *inst)
2052 {
2053 struct ir3_instruction *instr;
2054 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2055 struct tgsi_src_register *src = &inst->Src[0].Register;
2056
2057 /* cov.<src_type><dst_type> dst, src */
2058 instr = instr_create(ctx, 1, 0);
2059 switch (t->tgsi_opc) {
2060 case TGSI_OPCODE_U2F:
2061 instr->cat1.src_type = TYPE_U32;
2062 instr->cat1.dst_type = TYPE_F32;
2063 break;
2064 case TGSI_OPCODE_I2F:
2065 instr->cat1.src_type = TYPE_S32;
2066 instr->cat1.dst_type = TYPE_F32;
2067 break;
2068 case TGSI_OPCODE_F2U:
2069 instr->cat1.src_type = TYPE_F32;
2070 instr->cat1.dst_type = TYPE_U32;
2071 break;
2072 case TGSI_OPCODE_F2I:
2073 instr->cat1.src_type = TYPE_F32;
2074 instr->cat1.dst_type = TYPE_S32;
2075 break;
2076
2077 }
2078 vectorize(ctx, instr, dst, 1, src, 0);
2079 put_dst(ctx, inst, dst);
2080 }
2081
2082 /*
2083 * UMUL / UMAD
2084 *
2085 * There is no 32-bit multiply instruction, so splitting a and b into high and
2086 * low components, we get that
2087 *
2088 * dst = (al * bl) + ((ah * bl) << 16) + ((al * bh) << 16)
2089 * (the (ah * bh) << 32 term falls entirely outside the low 32 bits)
2090 * mull.u tmp0, a, b (mul low, i.e. al * bl)
2091 * madsh.m16 tmp1, a, b, tmp0 (mul-add shift high mix, i.e. ah * bl << 16)
2092 * madsh.m16 dst, b, a, tmp1 (i.e. al * bh << 16)
2093 *
2094 * For UMAD, add in the extra argument after mull.u.
2095 */
2096 static void
2097 trans_umul(const struct instr_translater *t,
2098 struct ir3_compile_context *ctx,
2099 struct tgsi_full_instruction *inst)
2100 {
2101 struct ir3_instruction *instr;
2102 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2103 struct tgsi_src_register *a = &inst->Src[0].Register;
2104 struct tgsi_src_register *b = &inst->Src[1].Register;
2105
2106 struct tgsi_dst_register tmp0_dst, tmp1_dst;
2107 struct tgsi_src_register *tmp0_src, *tmp1_src;
2108
2109 tmp0_src = get_internal_temp(ctx, &tmp0_dst);
2110 tmp1_src = get_internal_temp(ctx, &tmp1_dst);
2111
2112 if (is_rel_or_const(a))
2113 a = get_unconst(ctx, a);
2114 if (is_rel_or_const(b))
2115 b = get_unconst(ctx, b);
2116
2117 /* mull.u tmp0, a, b */
2118 instr = instr_create(ctx, 2, OPC_MULL_U);
2119 vectorize(ctx, instr, &tmp0_dst, 2, a, 0, b, 0);
2120
2121 if (t->tgsi_opc == TGSI_OPCODE_UMAD) {
2122 struct tgsi_src_register *c = &inst->Src[2].Register;
2123
2124 /* add.u tmp0, tmp0, c */
2125 instr = instr_create(ctx, 2, OPC_ADD_U);
2126 vectorize(ctx, instr, &tmp0_dst, 2, tmp0_src, 0, c, 0);
2127 }
2128
2129 /* madsh.m16 tmp1, a, b, tmp0 */
2130 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2131 vectorize(ctx, instr, &tmp1_dst, 3, a, 0, b, 0, tmp0_src, 0);
2132
2133 /* madsh.m16 dst, b, a, tmp1 */
2134 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2135 vectorize(ctx, instr, dst, 3, b, 0, a, 0, tmp1_src, 0);
2136 put_dst(ctx, inst, dst);
2137 }
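
/* Editor's sketch (not part of the driver; assumes <stdint.h>): the same
 * decomposition in plain C, emulating mull.u plus the two madsh.m16's on
 * 16-bit halves.  The function name is illustrative.
 */
#if 0
static uint32_t
emulate_umul(uint32_t a, uint32_t b)
{
	uint32_t al = a & 0xffff, ah = a >> 16;
	uint32_t bl = b & 0xffff, bh = b >> 16;
	uint32_t acc = al * bl;            /* mull.u    tmp0, a, b       */
	acc += (ah * bl) << 16;            /* madsh.m16 tmp1, a, b, tmp0 */
	acc += (al * bh) << 16;            /* madsh.m16 dst,  b, a, tmp1 */
	return acc;    /* the (ah * bh) << 32 term overflows out entirely */
}
#endif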
2138
2139 /*
2140 * IDIV / UDIV / MOD / UMOD
2141 *
2142 * See NV50LegalizeSSA::handleDIV for the origin of this implementation. For
2143 * MOD/UMOD, it becomes a - [IU]DIV(a, modulus) * modulus.
2144 */
2145 static void
2146 trans_idiv(const struct instr_translater *t,
2147 struct ir3_compile_context *ctx,
2148 struct tgsi_full_instruction *inst)
2149 {
2150 struct ir3_instruction *instr;
2151 struct tgsi_dst_register *dst = get_dst(ctx, inst), *premod_dst = dst;
2152 struct tgsi_src_register *a = &inst->Src[0].Register;
2153 struct tgsi_src_register *b = &inst->Src[1].Register;
2154
2155 struct tgsi_dst_register af_dst, bf_dst, q_dst, r_dst, a_dst, b_dst;
2156 struct tgsi_src_register *af_src, *bf_src, *q_src, *r_src, *a_src, *b_src;
2157
2158 struct tgsi_src_register negative_2, thirty_one;
2159 type_t src_type;
2160
2161 if (t->tgsi_opc == TGSI_OPCODE_IDIV || t->tgsi_opc == TGSI_OPCODE_MOD)
2162 src_type = get_stype(ctx);
2163 else
2164 src_type = get_utype(ctx);
2165
2166 af_src = get_internal_temp(ctx, &af_dst);
2167 bf_src = get_internal_temp(ctx, &bf_dst);
2168 q_src = get_internal_temp(ctx, &q_dst);
2169 r_src = get_internal_temp(ctx, &r_dst);
2170 a_src = get_internal_temp(ctx, &a_dst);
2171 b_src = get_internal_temp(ctx, &b_dst);
2172
2173 get_immediate(ctx, &negative_2, -2);
2174 get_immediate(ctx, &thirty_one, 31);
2175
2176 if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD)
2177 premod_dst = &q_dst;
2178
2179 /* cov.[us]32f32 af, numerator */
2180 instr = instr_create(ctx, 1, 0);
2181 instr->cat1.src_type = src_type;
2182 instr->cat1.dst_type = get_ftype(ctx);
2183 vectorize(ctx, instr, &af_dst, 1, a, 0);
2184
2185 /* cov.[us]32f32 bf, denominator */
2186 instr = instr_create(ctx, 1, 0);
2187 instr->cat1.src_type = src_type;
2188 instr->cat1.dst_type = get_ftype(ctx);
2189 vectorize(ctx, instr, &bf_dst, 1, b, 0);
2190
2191 /* Get the absolute values for IDIV */
2192 if (type_sint(src_type)) {
2193 /* absneg.f af, (abs)af */
2194 instr = instr_create(ctx, 2, OPC_ABSNEG_F);
2195 vectorize(ctx, instr, &af_dst, 1, af_src, IR3_REG_ABS);
2196
2197 /* absneg.f bf, (abs)bf */
2198 instr = instr_create(ctx, 2, OPC_ABSNEG_F);
2199 vectorize(ctx, instr, &bf_dst, 1, bf_src, IR3_REG_ABS);
2200
2201 /* absneg.s a, (abs)numerator */
2202 instr = instr_create(ctx, 2, OPC_ABSNEG_S);
2203 vectorize(ctx, instr, &a_dst, 1, a, IR3_REG_ABS);
2204
2205 /* absneg.s b, (abs)denominator */
2206 instr = instr_create(ctx, 2, OPC_ABSNEG_S);
2207 vectorize(ctx, instr, &b_dst, 1, b, IR3_REG_ABS);
2208 } else {
2209 /* mov.u32u32 a, numerator */
2210 instr = instr_create(ctx, 1, 0);
2211 instr->cat1.src_type = src_type;
2212 instr->cat1.dst_type = src_type;
2213 vectorize(ctx, instr, &a_dst, 1, a, 0);
2214
2215 /* mov.u32u32 b, denominator */
2216 instr = instr_create(ctx, 1, 0);
2217 instr->cat1.src_type = src_type;
2218 instr->cat1.dst_type = src_type;
2219 vectorize(ctx, instr, &b_dst, 1, b, 0);
2220 }
2221
2222 /* rcp.f bf, bf */
2223 instr = instr_create(ctx, 4, OPC_RCP);
2224 vectorize(ctx, instr, &bf_dst, 1, bf_src, 0);
2225
2226 /* That's right, subtract 2 as an integer from the float: this nudges the reciprocal estimate down a couple ulps, so the quotient estimate errs low (and is fixed up below) */
2227 /* add.u bf, bf, -2 */
2228 instr = instr_create(ctx, 2, OPC_ADD_U);
2229 vectorize(ctx, instr, &bf_dst, 2, bf_src, 0, &negative_2, 0);
2230
2231 /* mul.f q, af, bf */
2232 instr = instr_create(ctx, 2, OPC_MUL_F);
2233 vectorize(ctx, instr, &q_dst, 2, af_src, 0, bf_src, 0);
2234
2235 /* cov.f32[us]32 q, q */
2236 instr = instr_create(ctx, 1, 0);
2237 instr->cat1.src_type = get_ftype(ctx);
2238 instr->cat1.dst_type = src_type;
2239 vectorize(ctx, instr, &q_dst, 1, q_src, 0);
2240
2241 /* integer multiply q by b */
2242 /* mull.u r, q, b */
2243 instr = instr_create(ctx, 2, OPC_MULL_U);
2244 vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);
2245
2246 /* madsh.m16 r, q, b, r */
2247 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2248 vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);
2249
2250 /* madsh.m16 r, b, q, r */
2251 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2252 vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);
2253
2254 /* sub.u r, a, r */
2255 instr = instr_create(ctx, 2, OPC_SUB_U);
2256 vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);
2257
2258 /* cov.u32f32 r, r */
2259 instr = instr_create(ctx, 1, 0);
2260 instr->cat1.src_type = get_utype(ctx);
2261 instr->cat1.dst_type = get_ftype(ctx);
2262 vectorize(ctx, instr, &r_dst, 1, r_src, 0);
2263
2264 /* mul.f r, r, bf */
2265 instr = instr_create(ctx, 2, OPC_MUL_F);
2266 vectorize(ctx, instr, &r_dst, 2, r_src, 0, bf_src, 0);
2267
2268 /* cov.f32u32 r, r */
2269 instr = instr_create(ctx, 1, 0);
2270 instr->cat1.src_type = get_ftype(ctx);
2271 instr->cat1.dst_type = get_utype(ctx);
2272 vectorize(ctx, instr, &r_dst, 1, r_src, 0);
2273
2274 /* add.u q, q, r */
2275 instr = instr_create(ctx, 2, OPC_ADD_U);
2276 vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);
2277
2278 /* mull.u r, q, b */
2279 instr = instr_create(ctx, 2, OPC_MULL_U);
2280 vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);
2281
2282 /* madsh.m16 r, q, b, r */
2283 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2284 vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);
2285
2286 /* madsh.m16 r, b, q, r */
2287 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2288 vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);
2289
2290 /* sub.u r, a, r */
2291 instr = instr_create(ctx, 2, OPC_SUB_U);
2292 vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);
2293
2294 /* cmps.u.ge r, r, b */
2295 instr = instr_create(ctx, 2, OPC_CMPS_U);
2296 instr->cat2.condition = IR3_COND_GE;
2297 vectorize(ctx, instr, &r_dst, 2, r_src, 0, b_src, 0);
2298
2299 if (type_uint(src_type)) {
2300 /* add.u dst, q, r */
2301 instr = instr_create(ctx, 2, OPC_ADD_U);
2302 vectorize(ctx, instr, premod_dst, 2, q_src, 0, r_src, 0);
2303 } else {
2304 /* add.u q, q, r */
2305 instr = instr_create(ctx, 2, OPC_ADD_U);
2306 vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);
2307
2308 /* negate result based on the original arguments */
2309 if (is_const(a) && is_const(b))
2310 a = get_unconst(ctx, a);
2311
2312 /* xor.b r, numerator, denominator */
2313 instr = instr_create(ctx, 2, OPC_XOR_B);
2314 vectorize(ctx, instr, &r_dst, 2, a, 0, b, 0);
2315
2316 /* shr.b r, r, 31 */
2317 instr = instr_create(ctx, 2, OPC_SHR_B);
2318 vectorize(ctx, instr, &r_dst, 2, r_src, 0, &thirty_one, 0);
2319
2320 /* absneg.s b, (neg)q */
2321 instr = instr_create(ctx, 2, OPC_ABSNEG_S);
2322 vectorize(ctx, instr, &b_dst, 1, q_src, IR3_REG_NEGATE);
2323
2324 /* sel.b dst, b, r, q */
2325 instr = instr_create(ctx, 3, OPC_SEL_B32);
2326 vectorize(ctx, instr, premod_dst, 3, b_src, 0, r_src, 0, q_src, 0);
2327 }
2328
2329 if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD) {
2330 /* The division result will have ended up in q. */
2331
2332 if (is_rel_or_const(b))
2333 b = get_unconst(ctx, b);
2334
2335 /* mull.u r, q, b */
2336 instr = instr_create(ctx, 2, OPC_MULL_U);
2337 vectorize(ctx, instr, &r_dst, 2, q_src, 0, b, 0);
2338
2339 /* madsh.m16 r, q, b, r */
2340 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2341 vectorize(ctx, instr, &r_dst, 3, q_src, 0, b, 0, r_src, 0);
2342
2343 /* madsh.m16 r, b, q, r */
2344 instr = instr_create(ctx, 3, OPC_MADSH_M16);
2345 vectorize(ctx, instr, &r_dst, 3, b, 0, q_src, 0, r_src, 0);
2346
2347 /* sub.u dst, a, r */
2348 instr = instr_create(ctx, 2, OPC_SUB_U);
2349 vectorize(ctx, instr, dst, 2, a, 0, r_src, 0);
2350 }
2351
2352 put_dst(ctx, inst, dst);
2353 }
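
/* Editor's sketch (not part of the driver; assumes <stdint.h> and
 * <string.h>): the unsigned path of the sequence above in plain C.
 * memcpy type-punning stands in for the "integer add on a float"
 * trick; the function name is illustrative.
 */
#if 0
static uint32_t
emulate_udiv(uint32_t a, uint32_t b)
{
	float af = (float)a;               /* cov.u32f32 af, a          */
	float bf = 1.0f / (float)b;        /* cov.u32f32 + rcp.f        */
	uint32_t bits;
	memcpy(&bits, &bf, sizeof(bits));
	bits -= 2;                         /* add.u bf, bf, -2          */
	memcpy(&bf, &bits, sizeof(bits));
	uint32_t q = (uint32_t)(af * bf);  /* mul.f + cov.f32u32        */
	uint32_t r = a - q * b;            /* mull.u/madsh.m16 + sub.u  */
	q += (uint32_t)((float)r * bf);    /* refine the estimate       */
	r = a - q * b;
	q += (r >= b);                     /* cmps.u.ge + add.u         */
	return q;
}
#endif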
2354
2355 /*
2356 * Handlers for TGSI instructions which do have a 1:1 mapping to native
2357 * instructions:
2358 */
2359
2360 static void
2361 instr_cat0(const struct instr_translater *t,
2362 struct ir3_compile_context *ctx,
2363 struct tgsi_full_instruction *inst)
2364 {
2365 instr_create(ctx, 0, t->opc);
2366 }
2367
2368 static void
2369 instr_cat1(const struct instr_translater *t,
2370 struct ir3_compile_context *ctx,
2371 struct tgsi_full_instruction *inst)
2372 {
2373 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2374 struct tgsi_src_register *src = &inst->Src[0].Register;
2375 create_mov(ctx, dst, src);
2376 put_dst(ctx, inst, dst);
2377 }
2378
2379 static void
2380 instr_cat2(const struct instr_translater *t,
2381 struct ir3_compile_context *ctx,
2382 struct tgsi_full_instruction *inst)
2383 {
2384 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2385 struct tgsi_src_register *src0 = &inst->Src[0].Register;
2386 struct tgsi_src_register *src1 = &inst->Src[1].Register;
2387 struct ir3_instruction *instr;
2388 unsigned src0_flags = 0, src1_flags = 0;
2389
2390 switch (t->tgsi_opc) {
2391 case TGSI_OPCODE_ABS:
2392 case TGSI_OPCODE_IABS:
2393 src0_flags = IR3_REG_ABS;
2394 break;
2395 case TGSI_OPCODE_INEG:
2396 src0_flags = IR3_REG_NEGATE;
2397 break;
2398 case TGSI_OPCODE_SUB:
2399 src1_flags = IR3_REG_NEGATE;
2400 break;
2401 }
2402
2403 switch (t->opc) {
2404 case OPC_ABSNEG_F:
2405 case OPC_ABSNEG_S:
2406 case OPC_CLZ_B:
2407 case OPC_CLZ_S:
2408 case OPC_SIGN_F:
2409 case OPC_FLOOR_F:
2410 case OPC_CEIL_F:
2411 case OPC_RNDNE_F:
2412 case OPC_RNDAZ_F:
2413 case OPC_TRUNC_F:
2414 case OPC_NOT_B:
2415 case OPC_BFREV_B:
2416 case OPC_SETRM:
2417 case OPC_CBITS_B:
2418 /* these only have one src reg */
2419 instr = instr_create(ctx, 2, t->opc);
2420 vectorize(ctx, instr, dst, 1, src0, src0_flags);
2421 break;
2422 default:
2423 if (is_const(src0) && is_const(src1))
2424 src0 = get_unconst(ctx, src0);
2425
2426 instr = instr_create(ctx, 2, t->opc);
2427 vectorize(ctx, instr, dst, 2, src0, src0_flags,
2428 src1, src1_flags);
2429 break;
2430 }
2431
2432 put_dst(ctx, inst, dst);
2433 }
2434
2435 static void
2436 instr_cat3(const struct instr_translater *t,
2437 struct ir3_compile_context *ctx,
2438 struct tgsi_full_instruction *inst)
2439 {
2440 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2441 struct tgsi_src_register *src0 = &inst->Src[0].Register;
2442 struct tgsi_src_register *src1 = &inst->Src[1].Register;
2443 struct ir3_instruction *instr;
2444
2445 /* in particular, can't handle const for src1 for cat3..
2446 * for mad, we can swap first two src's if needed:
2447 */
2448 if (is_rel_or_const(src1)) {
2449 if (is_mad(t->opc) && !is_rel_or_const(src0)) {
2450 struct tgsi_src_register *tmp;
2451 tmp = src0;
2452 src0 = src1;
2453 src1 = tmp;
2454 } else {
2455 src1 = get_unconst(ctx, src1);
2456 }
2457 }
2458
2459 instr = instr_create(ctx, 3, t->opc);
2460 vectorize(ctx, instr, dst, 3, src0, 0, src1, 0,
2461 &inst->Src[2].Register, 0);
2462 put_dst(ctx, inst, dst);
2463 }
2464
2465 static void
2466 instr_cat4(const struct instr_translater *t,
2467 struct ir3_compile_context *ctx,
2468 struct tgsi_full_instruction *inst)
2469 {
2470 struct tgsi_dst_register *dst = get_dst(ctx, inst);
2471 struct tgsi_src_register *src = &inst->Src[0].Register;
2472 struct ir3_instruction *instr;
2473 unsigned i;
2474
2475 /* seems like blob compiler avoids const as src.. */
2476 if (is_const(src))
2477 src = get_unconst(ctx, src);
2478
2479 /* we need to replicate into each component: */
2480 for (i = 0; i < 4; i++) {
2481 if (dst->WriteMask & (1 << i)) {
2482 instr = instr_create(ctx, 4, t->opc);
2483 add_dst_reg(ctx, instr, dst, i);
2484 add_src_reg(ctx, instr, src, src->SwizzleX);
2485 }
2486 }
2487
2488 put_dst(ctx, inst, dst);
2489 }
2490
2491 static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
2492 #define INSTR(n, f, ...) \
2493 [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
2494
2495 INSTR(MOV, instr_cat1),
2496 INSTR(RCP, instr_cat4, .opc = OPC_RCP),
2497 INSTR(RSQ, instr_cat4, .opc = OPC_RSQ),
2498 INSTR(SQRT, instr_cat4, .opc = OPC_SQRT),
2499 INSTR(MUL, instr_cat2, .opc = OPC_MUL_F),
2500 INSTR(ADD, instr_cat2, .opc = OPC_ADD_F),
2501 INSTR(SUB, instr_cat2, .opc = OPC_ADD_F),
2502 INSTR(MIN, instr_cat2, .opc = OPC_MIN_F),
2503 INSTR(MAX, instr_cat2, .opc = OPC_MAX_F),
2504 INSTR(UADD, instr_cat2, .opc = OPC_ADD_U),
2505 INSTR(IMIN, instr_cat2, .opc = OPC_MIN_S),
2506 INSTR(UMIN, instr_cat2, .opc = OPC_MIN_U),
2507 INSTR(IMAX, instr_cat2, .opc = OPC_MAX_S),
2508 INSTR(UMAX, instr_cat2, .opc = OPC_MAX_U),
2509 INSTR(AND, instr_cat2, .opc = OPC_AND_B),
2510 INSTR(OR, instr_cat2, .opc = OPC_OR_B),
2511 INSTR(NOT, instr_cat2, .opc = OPC_NOT_B),
2512 INSTR(XOR, instr_cat2, .opc = OPC_XOR_B),
2513 INSTR(UMUL, trans_umul),
2514 INSTR(UMAD, trans_umul),
2515 INSTR(UDIV, trans_idiv),
2516 INSTR(IDIV, trans_idiv),
2517 INSTR(MOD, trans_idiv),
2518 INSTR(UMOD, trans_idiv),
2519 INSTR(SHL, instr_cat2, .opc = OPC_SHL_B),
2520 INSTR(USHR, instr_cat2, .opc = OPC_SHR_B),
2521 INSTR(ISHR, instr_cat2, .opc = OPC_ASHR_B),
2522 INSTR(IABS, instr_cat2, .opc = OPC_ABSNEG_S),
2523 INSTR(INEG, instr_cat2, .opc = OPC_ABSNEG_S),
2525 INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
2526 INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F),
2527 INSTR(CLAMP, trans_clamp),
2528 INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F),
2529 INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F),
2530 INSTR(SSG, instr_cat2, .opc = OPC_SIGN_F),
2531 INSTR(CEIL, instr_cat2, .opc = OPC_CEIL_F),
2532 INSTR(ARL, trans_arl),
2533 INSTR(UARL, trans_arl),
2534 INSTR(EX2, instr_cat4, .opc = OPC_EXP2),
2535 INSTR(LG2, instr_cat4, .opc = OPC_LOG2),
2536 INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F),
2537 INSTR(COS, instr_cat4, .opc = OPC_COS),
2538 INSTR(SIN, instr_cat4, .opc = OPC_SIN),
2539 INSTR(TEX, trans_samp, .opc = OPC_SAM),
2540 INSTR(TXP, trans_samp, .opc = OPC_SAM),
2541 INSTR(TXB, trans_samp, .opc = OPC_SAMB),
2542 INSTR(TXB2, trans_samp, .opc = OPC_SAMB),
2543 INSTR(TXL, trans_samp, .opc = OPC_SAML),
2544 INSTR(TXD, trans_samp, .opc = OPC_SAMGQ),
2545 INSTR(TXF, trans_samp, .opc = OPC_ISAML),
2546 INSTR(TXQ, trans_txq),
2547 INSTR(DDX, trans_deriv, .opc = OPC_DSX),
2548 INSTR(DDY, trans_deriv, .opc = OPC_DSY),
2549 INSTR(SGT, trans_cmp),
2550 INSTR(SLT, trans_cmp),
2551 INSTR(FSLT, trans_cmp),
2552 INSTR(SGE, trans_cmp),
2553 INSTR(FSGE, trans_cmp),
2554 INSTR(SLE, trans_cmp),
2555 INSTR(SNE, trans_cmp),
2556 INSTR(FSNE, trans_cmp),
2557 INSTR(SEQ, trans_cmp),
2558 INSTR(FSEQ, trans_cmp),
2559 INSTR(CMP, trans_cmp),
2560 INSTR(USNE, trans_icmp, .opc = OPC_CMPS_U),
2561 INSTR(USEQ, trans_icmp, .opc = OPC_CMPS_U),
2562 INSTR(ISGE, trans_icmp, .opc = OPC_CMPS_S),
2563 INSTR(USGE, trans_icmp, .opc = OPC_CMPS_U),
2564 INSTR(ISLT, trans_icmp, .opc = OPC_CMPS_S),
2565 INSTR(USLT, trans_icmp, .opc = OPC_CMPS_U),
2566 INSTR(UCMP, trans_ucmp),
2567 INSTR(ISSG, trans_issg),
2568 INSTR(IF, trans_if, .opc = OPC_CMPS_F),
2569 INSTR(UIF, trans_if, .opc = OPC_CMPS_U),
2570 INSTR(ELSE, trans_else),
2571 INSTR(ENDIF, trans_endif),
2572 INSTR(END, instr_cat0, .opc = OPC_END),
2573 INSTR(KILL, trans_kill, .opc = OPC_KILL),
2574 INSTR(KILL_IF, trans_killif, .opc = OPC_KILL),
2575 INSTR(I2F, trans_cov),
2576 INSTR(U2F, trans_cov),
2577 INSTR(F2I, trans_cov),
2578 INSTR(F2U, trans_cov),
2579 };
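
/* Editor's note: extending the table is a single designated-initializer
 * entry; a hypothetical translater would be wired up as e.g.
 *
 *    INSTR(DP2, trans_dotp, .opc = OPC_MUL_F),
 *
 * and any opcode without an entry falls through to the "unknown TGSI
 * opc" error path in compile_instructions().
 */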
2580
2581 static ir3_semantic
2582 decl_semantic(const struct tgsi_declaration_semantic *sem)
2583 {
2584 return ir3_semantic_name(sem->Name, sem->Index);
2585 }
2586
2587 static struct ir3_instruction *
2588 decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid,
2589 unsigned j, unsigned inloc)
2590 {
2591 struct ir3_instruction *instr;
2592 struct ir3_register *src;
2593
2594 /* bary.f dst, #inloc, r0.x */
2595 instr = instr_create(ctx, 2, OPC_BARY_F);
2596 ir3_reg_create(instr, regid, 0); /* dummy dst */
2597 ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
2598 src = ir3_reg_create(instr, 0, IR3_REG_SSA);
2599 src->wrmask = 0x3;
2600 src->instr = ctx->frag_pos;
2601
2602 return instr;
2603 }
2604
2605 /* TGSI_SEMANTIC_POSITION
2606 * """"""""""""""""""""""
2607 *
2608 * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
2609 * fragment shader input contains the fragment's window position. The X
2610 * component starts at zero and always increases from left to right.
2611 * The Y component starts at zero and always increases but Y=0 may either
2612 * indicate the top of the window or the bottom depending on the fragment
2613 * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
2614 * The Z coordinate ranges from 0 to 1 to represent depth from the front
2615 * to the back of the Z buffer. The W component contains the reciprocal
2616 * of the interpolated vertex position W component.
2617 */
2618 static struct ir3_instruction *
2619 decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid,
2620 unsigned j)
2621 {
2622 struct ir3_instruction *instr, *src;
2623
2624 compile_assert(ctx, !ctx->frag_coord[j]);
2625
2626 ctx->frag_coord[j] = create_input(ctx->block, NULL, 0);
2627
2628
2629 switch (j) {
2630 case 0: /* .x */
2631 case 1: /* .y */
2632 /* for frag_coord, we get unsigned values.. subtract (integer) 8
2633 * and divide by 16 (right-shift by 4), then convert to float;
2634 * this inverts v = x*16 + 8 (pixel center in 1/16-pixel units):
2635 */
2636
2637 /* add.s tmp, src, -8 */
2638 instr = instr_create(ctx, 2, OPC_ADD_S);
2639 ir3_reg_create(instr, regid, 0); /* dummy dst */
2640 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j];
2641 ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8;
2642 src = instr;
2643
2644 /* shr.b tmp, tmp, 4 */
2645 instr = instr_create(ctx, 2, OPC_SHR_B);
2646 ir3_reg_create(instr, regid, 0); /* dummy dst */
2647 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
2648 ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
2649 src = instr;
2650
2651 /* mov.u32f32 dst, tmp */
2652 instr = instr_create(ctx, 1, 0);
2653 instr->cat1.src_type = TYPE_U32;
2654 instr->cat1.dst_type = TYPE_F32;
2655 ir3_reg_create(instr, regid, 0); /* dummy dst */
2656 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
2657
2658 break;
2659 case 2: /* .z */
2660 case 3: /* .w */
2661 /* seems that we can use these as-is: */
2662 instr = ctx->frag_coord[j];
2663 break;
2664 default:
2665 compile_error(ctx, "invalid channel\n");
2666 instr = create_immed(ctx, 0.0);
2667 break;
2668 }
2669
2670 return instr;
2671 }
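
/* Editor's worked example: assuming the v = x*16 + 8 encoding noted
 * above, window x = 100 arrives as 1608, and (1608 - 8) >> 4 == 100,
 * which the mov.u32f32 then converts to 100.0f.
 */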
2672
2673 /* TGSI_SEMANTIC_FACE
2674 * """"""""""""""""""
2675 *
2676 * This label applies to fragment shader inputs only and indicates that
2677 * the register contains front/back-face information of the form (F, 0,
2678 * 0, 1). The first component will be positive when the fragment belongs
2679 * to a front-facing polygon, and negative when the fragment belongs to a
2680 * back-facing polygon.
2681 */
2682 static struct ir3_instruction *
2683 decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid,
2684 unsigned j)
2685 {
2686 struct ir3_instruction *instr, *src;
2687
2688 switch (j) {
2689 case 0: /* .x */
2690 compile_assert(ctx, !ctx->frag_face);
2691
2692 ctx->frag_face = create_input(ctx->block, NULL, 0);
2693
2694 /* for faceness, we always get -1 or 0 (int).. but TGSI expects
2695 * positive vs negative float.. and piglit further seems to
2696 * expect -1.0 or 1.0:
2697 *
2698 * mul.s tmp, hr0.x, 2
2699 * add.s tmp, tmp, 1
2700 * mov.s32f32 dst, tmp
2701 * (mapping -1 -> -1.0 and 0 -> 1.0)
2702 */
2703
2704 instr = instr_create(ctx, 2, OPC_MUL_S);
2705 ir3_reg_create(instr, regid, 0); /* dummy dst */
2706 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face;
2707 ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
2708 src = instr;
2709
2710 instr = instr_create(ctx, 2, OPC_ADD_S);
2711 ir3_reg_create(instr, regid, 0); /* dummy dst */
2712 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
2713 ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
2714 src = instr;
2715
2716 instr = instr_create(ctx, 1, 0); /* mov */
2717 instr->cat1.src_type = TYPE_S32;
2718 instr->cat1.dst_type = TYPE_F32;
2719 ir3_reg_create(instr, regid, 0); /* dummy dst */
2720 ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
2721
2722 break;
2723 case 1: /* .y */
2724 case 2: /* .z */
2725 instr = create_immed(ctx, 0.0);
2726 break;
2727 case 3: /* .w */
2728 instr = create_immed(ctx, 1.0);
2729 break;
2730 default:
2731 compile_error(ctx, "invalid channel\n");
2732 instr = create_immed(ctx, 0.0);
2733 break;
2734 }
2735
2736 return instr;
2737 }
2738
2739 static void
2740 decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
2741 {
2742 struct ir3_shader_variant *so = ctx->so;
2743 unsigned name = decl->Semantic.Name;
2744 unsigned i;
2745
2746 /* I don't think we should get frag shader input without
2747 * semantic info? Otherwise how do inputs get linked to
2748 * vert outputs?
2749 */
2750 compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
2751 decl->Declaration.Semantic);
2752
2753 for (i = decl->Range.First; i <= decl->Range.Last; i++) {
2754 unsigned n = so->inputs_count++;
2755 unsigned r = regid(i, 0);
2756 unsigned ncomp, j;
2757
2758 /* we'll figure out the actual components used after scheduling */
2759 ncomp = 4;
2760
2761 DBG("decl in -> r%d", i);
2762
2763 compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
2764
2765 so->inputs[n].semantic = decl_semantic(&decl->Semantic);
2766 so->inputs[n].compmask = (1 << ncomp) - 1;
2767 so->inputs[n].regid = r;
2768 so->inputs[n].inloc = ctx->next_inloc;
2769 so->inputs[n].interpolate = decl->Interp.Interpolate;
2770
2771 for (j = 0; j < ncomp; j++) {
2772 struct ir3_instruction *instr = NULL;
2773
2774 if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
2775 /* for fragment shaders, POSITION and FACE are handled
2776 * specially, not using normal varying / bary.f
2777 */
2778 if (name == TGSI_SEMANTIC_POSITION) {
2779 so->inputs[n].bary = false;
2780 so->frag_coord = true;
2781 instr = decl_in_frag_coord(ctx, r + j, j);
2782 } else if (name == TGSI_SEMANTIC_FACE) {
2783 so->inputs[n].bary = false;
2784 so->frag_face = true;
2785 instr = decl_in_frag_face(ctx, r + j, j);
2786 } else {
2787 so->inputs[n].bary = true;
2788 instr = decl_in_frag_bary(ctx, r + j, j,
2789 so->inputs[n].inloc + j - 8);
2790 }
2791 } else {
2792 instr = create_input(ctx->block, NULL, (i * 4) + j);
2793 }
2794
2795 ctx->block->inputs[(i * 4) + j] = instr;
2796 }
2797
2798 if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) {
2799 ctx->next_inloc += ncomp;
2800 so->total_in += ncomp;
2801 }
2802 }
2803 }
2804
2805 static void
2806 decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
2807 {
2808 struct ir3_shader_variant *so = ctx->so;
2809 unsigned comp = 0;
2810 unsigned name = decl->Semantic.Name;
2811 unsigned i;
2812
2813 compile_assert(ctx, decl->Declaration.Semantic);
2814
2815 DBG("decl out[%d] -> r%d", name, decl->Range.First);
2816
2817 if (ctx->type == TGSI_PROCESSOR_VERTEX) {
2818 switch (name) {
2819 case TGSI_SEMANTIC_POSITION:
2820 so->writes_pos = true;
2821 break;
2822 case TGSI_SEMANTIC_PSIZE:
2823 so->writes_psize = true;
2824 break;
2825 case TGSI_SEMANTIC_COLOR:
2826 case TGSI_SEMANTIC_BCOLOR:
2827 case TGSI_SEMANTIC_GENERIC:
2828 case TGSI_SEMANTIC_FOG:
2829 case TGSI_SEMANTIC_TEXCOORD:
2830 break;
2831 default:
2832 compile_error(ctx, "unknown VS semantic name: %s\n",
2833 tgsi_semantic_names[name]);
2834 }
2835 } else {
2836 switch (name) {
2837 case TGSI_SEMANTIC_POSITION:
2838 comp = 2; /* tgsi will write to .z component */
2839 so->writes_pos = true;
2840 break;
2841 case TGSI_SEMANTIC_COLOR:
2842 break;
2843 default:
2844 compile_error(ctx, "unknown FS semantic name: %s\n",
2845 tgsi_semantic_names[name]);
2846 }
2847 }
2848
2849 for (i = decl->Range.First; i <= decl->Range.Last; i++) {
2850 unsigned n = so->outputs_count++;
2851 unsigned ncomp, j;
2852
2853 ncomp = 4;
2854
2855 compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
2856
2857 so->outputs[n].semantic = decl_semantic(&decl->Semantic);
2858 so->outputs[n].regid = regid(i, comp);
2859
2860 /* avoid undefined outputs: stick in a dummy mov from imm{0.0},
2861 * which will be over-written if the output is actually
2862 * assigned
2863 */
2864 for (j = 0; j < ncomp; j++)
2865 ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0);
2866 }
2867 }
2868
2869 /* from TGSI perspective, we actually have inputs. But most of the "inputs"
2870 * for a fragment shader are just bary.f instructions. The *actual* inputs
2871 * from the hw perspective are the frag_pos and optionally frag_coord and
2872 * frag_face.
2873 */
2874 static void
2875 fixup_frag_inputs(struct ir3_compile_context *ctx)
2876 {
2877 struct ir3_shader_variant *so = ctx->so;
2878 struct ir3_block *block = ctx->block;
2879 struct ir3_instruction **inputs;
2880 struct ir3_instruction *instr;
2881 int n, regid = 0;
2882
2883 block->ninputs = 0;
2884
2885 n = 4; /* always have frag_pos */
2886 n += COND(so->frag_face, 4);
2887 n += COND(so->frag_coord, 4);
2888
2889 inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));
2890
2891 if (so->frag_face) {
2892 /* this ultimately gets assigned to hr0.x so doesn't conflict
2893 * with frag_coord/frag_pos..
2894 */
2895 inputs[block->ninputs++] = ctx->frag_face;
2896 ctx->frag_face->regs[0]->num = 0;
2897
2898 /* remaining channels not used, but let's avoid confusing
2899 * other parts that expect inputs to come in groups of vec4
2900 */
2901 inputs[block->ninputs++] = NULL;
2902 inputs[block->ninputs++] = NULL;
2903 inputs[block->ninputs++] = NULL;
2904 }
2905
2906 /* since we don't know where to set the regid for frag_coord,
2907 * we have to use r0.x for it. But we don't want to *always*
2908 * use r1.x for frag_pos as that could increase the register
2909 * footprint on simple shaders:
2910 */
2911 if (so->frag_coord) {
2912 ctx->frag_coord[0]->regs[0]->num = regid++;
2913 ctx->frag_coord[1]->regs[0]->num = regid++;
2914 ctx->frag_coord[2]->regs[0]->num = regid++;
2915 ctx->frag_coord[3]->regs[0]->num = regid++;
2916
2917 inputs[block->ninputs++] = ctx->frag_coord[0];
2918 inputs[block->ninputs++] = ctx->frag_coord[1];
2919 inputs[block->ninputs++] = ctx->frag_coord[2];
2920 inputs[block->ninputs++] = ctx->frag_coord[3];
2921 }
2922
2923 /* we always have frag_pos: */
2924 so->pos_regid = regid;
2925
2926 /* r0.x */
2927 instr = create_input(block, NULL, block->ninputs);
2928 instr->regs[0]->num = regid++;
2929 inputs[block->ninputs++] = instr;
2930 ctx->frag_pos->regs[1]->instr = instr;
2931
2932 /* r0.y */
2933 instr = create_input(block, NULL, block->ninputs);
2934 instr->regs[0]->num = regid++;
2935 inputs[block->ninputs++] = instr;
2936 ctx->frag_pos->regs[2]->instr = instr;
2937
2938 block->inputs = inputs;
2939 }
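
/* Editor's sketch of the block->inputs[] layout this produces (slots
 * appear only when the corresponding feature is used):
 *
 *    [ frag_face, NULL, NULL, NULL ]   hr0.x, padded out to a vec4
 *    [ coord.x .. coord.w ]            r0.x..r0.w, when so->frag_coord
 *    [ pos.x, pos.y ]                  always; consumed via frag_pos
 *                                      by the bary.f varying fetches
 */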
2940
2941 static void
2942 compile_instructions(struct ir3_compile_context *ctx)
2943 {
2944 push_block(ctx);
2945
2946 /* for fragment shader, we have a single input register (usually
2947 * r0.xy) which is used as the base for bary.f varying fetch instrs:
2948 */
2949 if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
2950 struct ir3_instruction *instr;
2951 instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
2952 ir3_reg_create(instr, 0, 0);
2953 ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.x */
2954 ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.y */
2955 ctx->frag_pos = instr;
2956 }
2957
2958 while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
2959 tgsi_parse_token(&ctx->parser);
2960
2961 switch (ctx->parser.FullToken.Token.Type) {
2962 case TGSI_TOKEN_TYPE_DECLARATION: {
2963 struct tgsi_full_declaration *decl =
2964 &ctx->parser.FullToken.FullDeclaration;
2965 if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
2966 decl_out(ctx, decl);
2967 } else if (decl->Declaration.File == TGSI_FILE_INPUT) {
2968 decl_in(ctx, decl);
2969 }
2970 break;
2971 }
2972 case TGSI_TOKEN_TYPE_IMMEDIATE: {
2973 /* TODO: if we know the immediate is small enough, and only
2974 * used with instructions that can embed an immediate, we
2975 * can skip this:
2976 */
2977 struct tgsi_full_immediate *imm =
2978 &ctx->parser.FullToken.FullImmediate;
2979 unsigned n = ctx->so->immediates_count++;
2980 compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates));
2981 memcpy(ctx->so->immediates[n].val, imm->u, 16);
2982 break;
2983 }
2984 case TGSI_TOKEN_TYPE_INSTRUCTION: {
2985 struct tgsi_full_instruction *inst =
2986 &ctx->parser.FullToken.FullInstruction;
2987 unsigned opc = inst->Instruction.Opcode;
2988 const struct instr_translater *t = &translaters[opc];
2989
2990 if (t->fxn) {
2991 t->fxn(t, ctx, inst);
2992 ctx->num_internal_temps = 0;
2993
2994 compile_assert(ctx, !ctx->using_tmp_dst);
2995 } else {
2996 compile_error(ctx, "unknown TGSI opc: %s\n",
2997 tgsi_get_opcode_name(opc));
2998 }
2999
3000 switch (inst->Instruction.Saturate) {
3001 case TGSI_SAT_ZERO_ONE:
3002 create_clamp_imm(ctx, &inst->Dst[0].Register,
3003 fui(0.0), fui(1.0));
3004 break;
3005 case TGSI_SAT_MINUS_PLUS_ONE:
3006 create_clamp_imm(ctx, &inst->Dst[0].Register,
3007 fui(-1.0), fui(1.0));
3008 break;
3009 }
3010
3011 instr_finish(ctx);
3012
3013 break;
3014 }
3015 default:
3016 break;
3017 }
3018 }
3019 }
3020
3021 static void
3022 compile_dump(struct ir3_compile_context *ctx)
3023 {
3024 const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
3025 static unsigned n = 0;
3026 char fname[16];
3027 FILE *f;
3028 snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
3029 f = fopen(fname, "w");
3030 if (!f)
3031 return;
3032 ir3_block_depth(ctx->block);
3033 ir3_dump(ctx->ir, name, ctx->block, f);
3034 fclose(f);
3035 }
3036
3037 int
3038 ir3_compile_shader(struct ir3_shader_variant *so,
3039 const struct tgsi_token *tokens, struct ir3_shader_key key,
3040 bool cp)
3041 {
3042 struct ir3_compile_context ctx;
3043 struct ir3_block *block;
3044 struct ir3_instruction **inputs;
3045 unsigned i, j, actual_in;
3046 int ret = 0, max_bary;
3047
3048 assert(!so->ir);
3049
3050 so->ir = ir3_create();
3051
3052 assert(so->ir);
3053
3054 if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) {
3055 DBG("INIT failed!");
3056 ret = -1;
3057 goto out;
3058 }
3059
3060 compile_instructions(&ctx);
3061
3062 block = ctx.block;
3063 so->ir->block = block;
3064
3065 /* keep track of the inputs from TGSI perspective.. */
3066 inputs = block->inputs;
3067
3068 /* but fixup actual inputs for frag shader: */
3069 if (ctx.type == TGSI_PROCESSOR_FRAGMENT)
3070 fixup_frag_inputs(&ctx);
3071
3072 /* at this point, for binning pass, throw away unneeded outputs: */
3073 if (key.binning_pass) {
3074 for (i = 0, j = 0; i < so->outputs_count; i++) {
3075 unsigned name = sem2name(so->outputs[i].semantic);
3076 unsigned idx = sem2idx(so->outputs[i].semantic);
3077
3078 /* throw away everything but first position/psize */
3079 if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
3080 (name == TGSI_SEMANTIC_PSIZE))) {
3081 if (i != j) {
3082 so->outputs[j] = so->outputs[i];
3083 block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
3084 block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
3085 block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
3086 block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
3087 }
3088 j++;
3089 }
3090 }
3091 so->outputs_count = j;
3092 block->noutputs = j * 4;
3093 }
3094
3095 /* for rendering to alpha format, we only need the .w component,
3096 * and we need it to be in the .x position:
3097 */
3098 if (key.alpha) {
3099 for (i = 0, j = 0; i < so->outputs_count; i++) {
3100 unsigned name = sem2name(so->outputs[i].semantic);
3101
3102 /* move .w component to .x and discard others: */
3103 if (name == TGSI_SEMANTIC_COLOR) {
3104 block->outputs[(i*4)+0] = block->outputs[(i*4)+3];
3105 block->outputs[(i*4)+1] = NULL;
3106 block->outputs[(i*4)+2] = NULL;
3107 block->outputs[(i*4)+3] = NULL;
3108 }
3109 }
3110 }
3111
3112 /* at this point, we want the kills in the outputs array too,
3113 * so that they get scheduled (since they have no dst).. we've
3114 * already ensured that the array is big enough in push_block():
3115 */
3116 if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
3117 for (i = 0; i < ctx.kill_count; i++)
3118 block->outputs[block->noutputs++] = ctx.kill[i];
3119 }
3120
3121 if (fd_mesa_debug & FD_DBG_OPTDUMP)
3122 compile_dump(&ctx);
3123
3124 ret = ir3_block_flatten(block);
3125 if (ret < 0) {
3126 DBG("FLATTEN failed!");
3127 goto out;
3128 }
3129 if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP))
3130 compile_dump(&ctx);
3131
3132 if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3133 printf("BEFORE CP:\n");
3134 ir3_dump_instr_list(block->head);
3135 }
3136
3137 if (cp && !(fd_mesa_debug & FD_DBG_NOCP))
3138 ir3_block_cp(block);
3139
3140 if (fd_mesa_debug & FD_DBG_OPTDUMP)
3141 compile_dump(&ctx);
3142
3143 ir3_block_depth(block);
3144
3145 if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3146 printf("AFTER DEPTH:\n");
3147 ir3_dump_instr_list(block->head);
3148 }
3149
3150 ret = ir3_block_sched(block);
3151 if (ret) {
3152 DBG("SCHED failed!");
3153 goto out;
3154 }
3155
3156 if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3157 printf("AFTER SCHED:\n");
3158 ir3_dump_instr_list(block->head);
3159 }
3160
3161 ret = ir3_block_ra(block, so->type, key.half_precision,
3162 so->frag_coord, so->frag_face);
3163 if (ret) {
3164 DBG("RA failed!");
3165 goto out;
3166 }
3167
3168 ir3_block_legalize(block, &so->has_samp, &max_bary);
3169
3170 if (fd_mesa_debug & FD_DBG_OPTMSGS) {
3171 printf("AFTER RA:\n");
3172 ir3_dump_instr_list(block->head);
3173 }
3174
3175 /* fixup input/outputs: */
3176 for (i = 0; i < so->outputs_count; i++) {
3177 so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
3178 /* preserve hack for depth output.. tgsi writes depth to .z,
3179 * but what we give the hw is the scalar register:
3180 */
3181 if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
3182 (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
3183 so->outputs[i].regid += 2;
3184 }
3185 /* Note that some or all channels of an input may be unused: */
3186 actual_in = 0;
3187 for (i = 0; i < so->inputs_count; i++) {
3188 unsigned j, regid = ~0, compmask = 0;
3189 so->inputs[i].ncomp = 0;
3190 for (j = 0; j < 4; j++) {
3191 struct ir3_instruction *in = inputs[(i*4) + j];
3192 if (in) {
3193 compmask |= (1 << j);
3194 regid = in->regs[0]->num - j;
3195 actual_in++;
3196 so->inputs[i].ncomp++;
3197 }
3198 }
3199 so->inputs[i].regid = regid;
3200 so->inputs[i].compmask = compmask;
3201 }
3202
3203 /* fragment shader always gets full vec4's even if it doesn't
3204 * fetch all components, but for the vertex shader we need to
3205 * update with the actual number of components fetched, otherwise
3206 * things will hang due to a mismatch between the VFD_DECODE's
3207 * and TOTALATTRTOVS
3208 */
3209 if (so->type == SHADER_VERTEX)
3210 so->total_in = actual_in;
3211 else
3212 so->total_in = align(max_bary + 1, 4);
3213
3214 out:
3215 if (ret) {
3216 ir3_destroy(so->ir);
3217 so->ir = NULL;
3218 }
3219 compile_free(&ctx);
3220
3221 return ret;
3222 }