mesa.git @ b1ed2e09457feb67c8750b73a70f80efc134aa70
src/gallium/drivers/freedreno/a3xx/fd3_compiler.c
1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
2
3 /*
4 * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 *
25 * Authors:
26 * Rob Clark <robclark@freedesktop.org>
27 */
28
29 #include <stdarg.h>
30
31 #include "pipe/p_state.h"
32 #include "util/u_string.h"
33 #include "util/u_memory.h"
34 #include "util/u_inlines.h"
35 #include "tgsi/tgsi_parse.h"
36 #include "tgsi/tgsi_ureg.h"
37 #include "tgsi/tgsi_info.h"
38 #include "tgsi/tgsi_strings.h"
39 #include "tgsi/tgsi_dump.h"
40 #include "tgsi/tgsi_scan.h"
41
42 #include "fd3_compiler.h"
43 #include "fd3_program.h"
44 #include "fd3_util.h"
45
46 #include "instr-a3xx.h"
47 #include "ir3.h"
48
49
50 struct fd3_compile_context {
51 const struct tgsi_token *tokens;
52 struct ir3_shader *ir;
53 struct fd3_shader_stateobj *so;
54
55 struct ir3_block *block;
56 struct ir3_instruction *current_instr;
57
58 /* we need to defer updates to block->outputs[] until the end
59 * of an instruction (so we don't see the new value until *after*
60 * the src registers are processed)
61 */
62 struct {
63 struct ir3_instruction *instr, **instrp;
64 } output_updates[16];
65 unsigned num_output_updates;
66
67 /* are we in a sequence of "atomic" instructions?
68 */
69 bool atomic;
70
71 /* For fragment shaders, from the hw perspective the only
72 * actual input is the r0.xy position register passed to bary.f.
73 * But TGSI doesn't know that; it still declares things as
74 * IN[] registers. So we do all the input tracking normally
75 * and fix things up after compile_instructions()
76 */
77 struct ir3_instruction *frag_pos;
78
79 struct tgsi_parse_context parser;
80 unsigned type;
81
82 struct tgsi_shader_info info;
83
84 /* for calculating input/output positions/linkages: */
85 unsigned next_inloc;
86
87 unsigned num_internal_temps;
88 struct tgsi_src_register internal_temps[6];
89
90 /* inputs start at r0, temporaries start after last input, and
91 * outputs start after last temporary.
92 *
93 * We could be more clever, because this is not a hw restriction,
94 * but probably best just to implement an optimizing pass to
95 * reduce the # of registers used and get rid of redundant mov's
96 * (to output register).
97 */
98 unsigned base_reg[TGSI_FILE_COUNT];
99
100 /* idx/slot for last compiler generated immediate */
101 unsigned immediate_idx;
102
103 /* stack of branch instructions that mark (potentially nested)
104 * branch if/else/loop/etc
105 */
106 struct ir3_instruction *branch[16];
107 unsigned int branch_count;
108
109 /* used when dst is same as one of the src, to avoid overwriting a
110 * src element before the remaining scalar instructions that make
111 * up the vector operation
112 */
113 struct tgsi_dst_register tmp_dst;
114 struct tgsi_src_register *tmp_src;
115 };
116
117
118 static void vectorize(struct fd3_compile_context *ctx,
119 struct ir3_instruction *instr, struct tgsi_dst_register *dst,
120 int nsrcs, ...);
121 static void create_mov(struct fd3_compile_context *ctx,
122 struct tgsi_dst_register *dst, struct tgsi_src_register *src);
123 static type_t get_ftype(struct fd3_compile_context *ctx);
124
125 static unsigned
126 compile_init(struct fd3_compile_context *ctx, struct fd3_shader_stateobj *so,
127 const struct tgsi_token *tokens)
128 {
129 unsigned ret, base = 0;
130 struct tgsi_shader_info *info = &ctx->info;
131
132 ctx->tokens = tokens;
133 ctx->ir = so->ir;
134 ctx->so = so;
135 ctx->next_inloc = 8;
136 ctx->num_internal_temps = 0;
137 ctx->branch_count = 0;
138 ctx->block = NULL;
139 ctx->current_instr = NULL;
140 ctx->num_output_updates = 0;
141 ctx->atomic = false;
142
143 memset(ctx->base_reg, 0, sizeof(ctx->base_reg));
144
145 tgsi_scan_shader(tokens, &ctx->info);
146
147 #define FM(x) (1 << TGSI_FILE_##x)
148 /* the optimizer can't deal with relative addressing: */
149 if (info->indirect_files & (FM(TEMPORARY) | FM(INPUT) |
150 FM(OUTPUT) | FM(IMMEDIATE) | FM(CONSTANT)))
151 return TGSI_PARSE_ERROR;
152
153 /* Immediates go after constants: */
154 ctx->base_reg[TGSI_FILE_CONSTANT] = 0;
155 ctx->base_reg[TGSI_FILE_IMMEDIATE] =
156 info->file_max[TGSI_FILE_CONSTANT] + 1;
157
158 /* if full precision and fragment shader, don't clobber
159 * r0.xy w/ bary fetch:
160 */
161 if ((so->type == SHADER_FRAGMENT) && !so->half_precision)
162 base = 1;
163
164 /* Temporaries after outputs after inputs: */
165 ctx->base_reg[TGSI_FILE_INPUT] = base;
166 ctx->base_reg[TGSI_FILE_OUTPUT] = base +
167 info->file_max[TGSI_FILE_INPUT] + 1;
168 ctx->base_reg[TGSI_FILE_TEMPORARY] = base +
169 info->file_max[TGSI_FILE_INPUT] + 1 +
170 info->file_max[TGSI_FILE_OUTPUT] + 1;
171
172 so->first_immediate = ctx->base_reg[TGSI_FILE_IMMEDIATE];
173 ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);
174
175 ret = tgsi_parse_init(&ctx->parser, tokens);
176 if (ret != TGSI_PARSE_OK)
177 return ret;
178
179 ctx->type = ctx->parser.FullHeader.Processor.Processor;
180
181 return ret;
182 }
183
184 static void
185 compile_error(struct fd3_compile_context *ctx, const char *format, ...)
186 {
187 va_list ap;
188 va_start(ap, format);
189 _debug_vprintf(format, ap);
190 va_end(ap);
191 tgsi_dump(ctx->tokens, 0);
192 debug_assert(0);
193 }
194
195 #define compile_assert(ctx, cond) do { \
196 if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
197 } while (0)
198
199 static void
200 compile_free(struct fd3_compile_context *ctx)
201 {
202 tgsi_parse_free(&ctx->parser);
203 }
204
205 struct instr_translater {
206 void (*fxn)(const struct instr_translater *t,
207 struct fd3_compile_context *ctx,
208 struct tgsi_full_instruction *inst);
209 unsigned tgsi_opc;
210 opc_t opc;
211 opc_t hopc; /* opc to use for half_precision mode, if different */
212 unsigned arg;
213 };
214
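/* Apply any deferred updates to block->outputs[]/temporaries[]
 * recorded by ssa_dst().  This runs when the current instruction
 * is complete (before the next one is created), so an instruction
 * that reads and writes the same TEMP/OUT register still sees the
 * old value in its srcs.  Skipped while inside an atomic group:
 */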
215 static void
216 instr_finish(struct fd3_compile_context *ctx)
217 {
218 unsigned i;
219
220 if (ctx->atomic)
221 return;
222
223 for (i = 0; i < ctx->num_output_updates; i++)
224 *(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr;
225
226 ctx->num_output_updates = 0;
227 }
228
229 /* For "atomic" groups of instructions, for example the four scalar
230 * instructions to perform a vec4 operation. Basically this just
231 * blocks out handling of output_updates so the next scalar instruction
232 * still sees the result from before the start of the atomic group.
233 *
234 * NOTE: when used properly, this could probably replace get/put_dst()
235 * stuff.
236 */
237 static void
238 instr_atomic_start(struct fd3_compile_context *ctx)
239 {
240 ctx->atomic = true;
241 }
242
243 static void
244 instr_atomic_end(struct fd3_compile_context *ctx)
245 {
246 ctx->atomic = false;
247 instr_finish(ctx);
248 }
249
250 static struct ir3_instruction *
251 instr_create(struct fd3_compile_context *ctx, int category, opc_t opc)
252 {
253 instr_finish(ctx);
254 return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc));
255 }
256
257 static struct ir3_instruction *
258 instr_clone(struct fd3_compile_context *ctx, struct ir3_instruction *instr)
259 {
260 instr_finish(ctx);
261 return (ctx->current_instr = ir3_instr_clone(instr));
262 }
263
264 static struct ir3_block *
265 push_block(struct fd3_compile_context *ctx)
266 {
267 struct ir3_block *block;
268 unsigned ntmp, nin, nout;
269
270 #define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1))
271
272 /* hmm, give ourselves room to create 4 extra temporaries (vec4):
273 */
274 ntmp = SCALAR_REGS(TEMPORARY);
275 ntmp += 4 * 4;
276
277 /* for outermost block, 'inputs' are the actual shader INPUT
278 * register file. Reads from INPUT registers always go back to
279 * top block. For nested blocks, 'inputs' is used to track any
280 * TEMPORARY file register from one of the enclosing blocks that
281 * is read in this block.
282 */
283 if (!ctx->block) {
284 /* NOTE: fragment shaders actually have two inputs (r0.xy, the
285 * position)
286 */
287 nin = SCALAR_REGS(INPUT);
288 if (ctx->type == TGSI_PROCESSOR_FRAGMENT)
289 nin = MAX2(2, nin);
290 } else {
291 nin = ntmp;
292 }
293
294 nout = SCALAR_REGS(OUTPUT);
295
296 block = ir3_block_create(ctx->ir, ntmp, nin, nout);
297
298 block->parent = ctx->block;
299 ctx->block = block;
300
301 return block;
302 }
303
304 static void
305 pop_block(struct fd3_compile_context *ctx)
306 {
307 ctx->block = ctx->block->parent;
308 compile_assert(ctx, ctx->block);
309 }
310
311 static void
312 ssa_dst(struct fd3_compile_context *ctx, struct ir3_instruction *instr,
313 const struct tgsi_dst_register *dst, unsigned chan)
314 {
315 unsigned n = regid(dst->Index, chan);
316 unsigned idx = ctx->num_output_updates;
317
318 compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates));
319
320 /* NOTE: defer update of temporaries[idx] or output[idx]
321 * until instr_finish(), so that if the current instruction
322 * reads the same TEMP/OUT[] it gets the old value:
323 *
324 * bleh.. this might be a bit easier to just figure out
325 * in instr_finish(). But at that point we've already
326 * lost information about OUTPUT vs TEMPORARY register
327 * file..
328 */
329
330 switch (dst->File) {
331 case TGSI_FILE_OUTPUT:
332 compile_assert(ctx, n < ctx->block->noutputs);
333 ctx->output_updates[idx].instrp = &ctx->block->outputs[n];
334 ctx->output_updates[idx].instr = instr;
335 ctx->num_output_updates++;
336 break;
337 case TGSI_FILE_TEMPORARY:
338 compile_assert(ctx, n < ctx->block->ntemporaries);
339 ctx->output_updates[idx].instrp = &ctx->block->temporaries[n];
340 ctx->output_updates[idx].instr = instr;
341 ctx->num_output_updates++;
342 break;
343 }
344 }
345
346 static struct ir3_instruction *
347 create_output(struct ir3_block *block, struct ir3_instruction *instr,
348 unsigned n)
349 {
350 struct ir3_instruction *out;
351
352 out = ir3_instr_create(block, -1, OPC_META_OUTPUT);
353 out->inout.block = block;
354 ir3_reg_create(out, n, 0);
355 if (instr)
356 ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr;
357
358 return out;
359 }
360
361 static struct ir3_instruction *
362 create_input(struct ir3_block *block, struct ir3_instruction *instr,
363 unsigned n)
364 {
365 struct ir3_instruction *in;
366
367 in = ir3_instr_create(block, -1, OPC_META_INPUT);
368 in->inout.block = block;
369 ir3_reg_create(in, n, 0);
370 if (instr)
371 ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;
372
373 return in;
374 }
375
376 static struct ir3_instruction *
377 block_input(struct ir3_block *block, unsigned n)
378 {
379 /* references to INPUT register file always go back up to
380 * top level:
381 */
382 if (block->parent)
383 return block_input(block->parent, n);
384 return block->inputs[n];
385 }
386
387 /* return temporary in scope, creating meta-input nodes as needed
388 * to track block inputs
389 */
390 static struct ir3_instruction *
391 block_temporary(struct ir3_block *block, unsigned n)
392 {
393 /* references to TEMPORARY register file, find the nearest
394 * enclosing block which has already assigned this temporary,
395 * creating meta-input instructions along the way to keep
396 * track of block inputs
397 */
398 if (block->parent && !block->temporaries[n]) {
399 /* if already have input for this block, reuse: */
400 if (!block->inputs[n])
401 block->inputs[n] = block_temporary(block->parent, n);
402
403 /* and create new input to return: */
404 return create_input(block, block->inputs[n], n);
405 }
406 return block->temporaries[n];
407 }
408
409 static struct ir3_instruction *
410 create_immed(struct fd3_compile_context *ctx, float val)
411 {
412 /* Used to substitute an immediate value for a src, for example
413 * when a register is read before it has been assigned (undefined
414 * contents) -- see ssa_src().
415 *
416 * NOTE: *don't* use instr_create() here!  That would call
417 * instr_finish() and flush the deferred output updates while
418 * another instruction may still be in the middle of being
419 * built.
420 */
421 struct ir3_instruction *instr;
422 instr = ir3_instr_create(ctx->block, 1, 0);
423 instr->cat1.src_type = get_ftype(ctx);
424 instr->cat1.dst_type = get_ftype(ctx);
425 ir3_reg_create(instr, 0, 0);
426 ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val;
427 return instr;
428 }
429
430 static void
431 ssa_src(struct fd3_compile_context *ctx, struct ir3_register *reg,
432 const struct tgsi_src_register *src, unsigned chan)
433 {
434 struct ir3_block *block = ctx->block;
435 unsigned n = regid(src->Index, chan);
436
437 switch (src->File) {
438 case TGSI_FILE_INPUT:
439 reg->flags |= IR3_REG_SSA;
440 reg->instr = block_input(ctx->block, n);
441 break;
442 case TGSI_FILE_OUTPUT:
443 /* really this should just happen in case of 'MOV_SAT OUT[n], ..',
444 * for the following clamp instructions:
445 */
446 reg->flags |= IR3_REG_SSA;
447 reg->instr = block->outputs[n];
448 /* we don't have to worry about a read from an OUTPUT that was
449 * assigned outside of the current block, because the _SAT
450 * clamp instructions will always be in the same block as
451 * the original instruction which wrote the OUTPUT
452 */
453 compile_assert(ctx, reg->instr);
454 break;
455 case TGSI_FILE_TEMPORARY:
456 reg->flags |= IR3_REG_SSA;
457 reg->instr = block_temporary(ctx->block, n);
458 break;
459 }
460
461 if ((reg->flags & IR3_REG_SSA) && !reg->instr) {
462 /* this can happen when registers (or components of a TGSI
463 * register) are used as src before they have been assigned
464 * (undefined contents). To avoid confusing the rest of the
465 * compiler, and to generally keep things peachy, substitute
466 * an instruction that sets the src to 0.0. Or to keep
467 * things undefined, I could plug in a random number? :-P
468 *
469 * NOTE: *don't* use instr_create() here!
470 */
471 reg->instr = create_immed(ctx, 0.0);
472 }
473 }
474
475 static struct ir3_register *
476 add_dst_reg_wrmask(struct fd3_compile_context *ctx,
477 struct ir3_instruction *instr, const struct tgsi_dst_register *dst,
478 unsigned chan, unsigned wrmask)
479 {
480 unsigned flags = 0, num = 0;
481 struct ir3_register *reg;
482
483 switch (dst->File) {
484 case TGSI_FILE_OUTPUT:
485 case TGSI_FILE_TEMPORARY:
486 num = dst->Index + ctx->base_reg[dst->File];
487 break;
488 case TGSI_FILE_ADDRESS:
489 num = REG_A0;
490 break;
491 default:
492 compile_error(ctx, "unsupported dst register file: %s\n",
493 tgsi_file_name(dst->File));
494 break;
495 }
496
497 if (dst->Indirect)
498 flags |= IR3_REG_RELATIV;
499 if (ctx->so->half_precision)
500 flags |= IR3_REG_HALF;
501
502 reg = ir3_reg_create(instr, regid(num, chan), flags);
503
504 /* NOTE: do not call ssa_dst() if atomic.. vectorize()
505 * itself will call ssa_dst(). This is to filter out
506 * the (initially bogus) .x component dst which is
507 * created (but not necessarily used, ie. if the net
508 * result of the vector operation does not write to
509 * the .x component)
510 */
511
512 reg->wrmask = wrmask;
513 if (wrmask == 0x1) {
514 /* normal case */
515 if (!ctx->atomic)
516 ssa_dst(ctx, instr, dst, chan);
517 } else if ((dst->File == TGSI_FILE_TEMPORARY) ||
518 (dst->File == TGSI_FILE_OUTPUT)) {
519 unsigned i;
520
521 /* if the instruction writes multiple components, we need to
522 * create some place-holders to collect the registers:
523 */
524 for (i = 0; i < 4; i++) {
525 if (wrmask & (1 << i)) {
526 struct ir3_instruction *collect =
527 ir3_instr_create(ctx->block, -1, OPC_META_FO);
528 collect->fo.off = i;
529 /* unused dst reg: */
530 ir3_reg_create(collect, 0, 0);
531 /* and src reg used to hold original instr */
532 ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = instr;
533 if (!ctx->atomic)
534 ssa_dst(ctx, collect, dst, chan+i);
535 }
536 }
537 }
538
539 return reg;
540 }
541
542 static struct ir3_register *
543 add_dst_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr,
544 const struct tgsi_dst_register *dst, unsigned chan)
545 {
546 return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1);
547 }
548
549 static struct ir3_register *
550 add_src_reg_wrmask(struct fd3_compile_context *ctx,
551 struct ir3_instruction *instr, const struct tgsi_src_register *src,
552 unsigned chan, unsigned wrmask)
553 {
554 unsigned flags = 0, num = 0;
555 struct ir3_register *reg;
556
557 /* TODO we need to use a mov to temp for const >= 64.. or maybe
558 * we could use relative addressing..
559 */
560 compile_assert(ctx, src->Index < 64);
561
562 switch (src->File) {
563 case TGSI_FILE_IMMEDIATE:
564 /* TODO if possible, use actual immediate instead of const.. but
565 * TGSI has vec4 immediates, we can only embed scalar (of limited
566 * size, depending on instruction..)
567 */
568 case TGSI_FILE_CONSTANT:
569 flags |= IR3_REG_CONST;
570 num = src->Index + ctx->base_reg[src->File];
571 break;
572 case TGSI_FILE_OUTPUT:
573 /* NOTE: we should only end up w/ OUTPUT file for things like
574 * clamp()'ing saturated dst instructions
575 */
576 case TGSI_FILE_INPUT:
577 case TGSI_FILE_TEMPORARY:
578 num = src->Index + ctx->base_reg[src->File];
579 break;
580 default:
581 compile_error(ctx, "unsupported src register file: %s\n",
582 tgsi_file_name(src->File));
583 break;
584 }
585
586 if (src->Absolute)
587 flags |= IR3_REG_ABS;
588 if (src->Negate)
589 flags |= IR3_REG_NEGATE;
590 if (src->Indirect)
591 flags |= IR3_REG_RELATIV;
592 if (ctx->so->half_precision)
593 flags |= IR3_REG_HALF;
594
595 reg = ir3_reg_create(instr, regid(num, chan), flags);
596
597 reg->wrmask = wrmask;
598 if (wrmask == 0x1) {
599 /* normal case */
600 ssa_src(ctx, reg, src, chan);
601 } else if ((src->File == TGSI_FILE_TEMPORARY) ||
602 (src->File == TGSI_FILE_OUTPUT) ||
603 (src->File == TGSI_FILE_INPUT)) {
604 struct ir3_instruction *collect;
605 unsigned i;
606
607 /* if the instruction reads multiple components, we need to
608 * create a place-holder to collect the registers:
609 */
610 collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
611 ir3_reg_create(collect, 0, 0); /* unused dst reg */
612
613 for (i = 0; i < 4; i++) {
614 if (wrmask & (1 << i)) {
615 /* and src reg used to point to the original instr */
616 ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
617 src, chan + i);
618 } else if (wrmask & ~((1 << i) - 1)) {
619 /* if any remaining components, then dummy
620 * placeholder src reg to fill in the blanks:
621 */
622 ir3_reg_create(collect, 0, 0);
623 }
624 }
625
626 reg->flags |= IR3_REG_SSA;
627 reg->instr = collect;
628 }
629
630 return reg;
631 }
632
633 static struct ir3_register *
634 add_src_reg(struct fd3_compile_context *ctx, struct ir3_instruction *instr,
635 const struct tgsi_src_register *src, unsigned chan)
636 {
637 return add_src_reg_wrmask(ctx, instr, src, chan, 0x1);
638 }
639
640 static void
641 src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
642 {
643 src->File = dst->File;
644 src->Indirect = dst->Indirect;
645 src->Dimension = dst->Dimension;
646 src->Index = dst->Index;
647 src->Absolute = 0;
648 src->Negate = 0;
649 src->SwizzleX = TGSI_SWIZZLE_X;
650 src->SwizzleY = TGSI_SWIZZLE_Y;
651 src->SwizzleZ = TGSI_SWIZZLE_Z;
652 src->SwizzleW = TGSI_SWIZZLE_W;
653 }
654
655 /* Get internal-temp src/dst to use for a sequence of instructions
656 * generated by a single TGSI op.
657 */
658 static struct tgsi_src_register *
659 get_internal_temp(struct fd3_compile_context *ctx,
660 struct tgsi_dst_register *tmp_dst)
661 {
662 struct tgsi_src_register *tmp_src;
663 int n;
664
665 tmp_dst->File = TGSI_FILE_TEMPORARY;
666 tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
667 tmp_dst->Indirect = 0;
668 tmp_dst->Dimension = 0;
669
670 /* assign next temporary: */
671 n = ctx->num_internal_temps++;
672 compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
673 tmp_src = &ctx->internal_temps[n];
674
675 tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;
676
677 src_from_dst(tmp_src, tmp_dst);
678
679 return tmp_src;
680 }
681
682 /* Get internal half-precision temp src/dst to use for a sequence of
683 * instructions generated by a single TGSI op.
684 */
685 static struct tgsi_src_register *
686 get_internal_temp_hr(struct fd3_compile_context *ctx,
687 struct tgsi_dst_register *tmp_dst)
688 {
689 struct tgsi_src_register *tmp_src;
690 int n;
691
692 if (ctx->so->half_precision)
693 return get_internal_temp(ctx, tmp_dst);
694
695 tmp_dst->File = TGSI_FILE_TEMPORARY;
696 tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
697 tmp_dst->Indirect = 0;
698 tmp_dst->Dimension = 0;
699
700 /* assign next temporary: */
701 n = ctx->num_internal_temps++;
702 compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
703 tmp_src = &ctx->internal_temps[n];
704
705 /* just use hr0 because no one else should be using half-
706 * precision regs:
707 */
708 tmp_dst->Index = 0;
709
710 src_from_dst(tmp_src, tmp_dst);
711
712 return tmp_src;
713 }
714
715 static inline bool
716 is_const(struct tgsi_src_register *src)
717 {
718 return (src->File == TGSI_FILE_CONSTANT) ||
719 (src->File == TGSI_FILE_IMMEDIATE);
720 }
721
722 static inline bool
723 is_relative(struct tgsi_src_register *src)
724 {
725 return src->Indirect;
726 }
727
728 static inline bool
729 is_rel_or_const(struct tgsi_src_register *src)
730 {
731 return is_relative(src) || is_const(src);
732 }
733
734 static type_t
735 get_ftype(struct fd3_compile_context *ctx)
736 {
737 return ctx->so->half_precision ? TYPE_F16 : TYPE_F32;
738 }
739
740 static type_t
741 get_utype(struct fd3_compile_context *ctx)
742 {
743 return ctx->so->half_precision ? TYPE_U16 : TYPE_U32;
744 }
745
746 static unsigned
747 src_swiz(struct tgsi_src_register *src, int chan)
748 {
749 switch (chan) {
750 case 0: return src->SwizzleX;
751 case 1: return src->SwizzleY;
752 case 2: return src->SwizzleZ;
753 case 3: return src->SwizzleW;
754 }
755 assert(0);
756 return 0;
757 }
758
759 /* for instructions that cannot take a const register as src, if needed
760 * generate a move into a temporary gpr:
761 */
762 static struct tgsi_src_register *
763 get_unconst(struct fd3_compile_context *ctx, struct tgsi_src_register *src)
764 {
765 struct tgsi_dst_register tmp_dst;
766 struct tgsi_src_register *tmp_src;
767
768 compile_assert(ctx, is_rel_or_const(src));
769
770 tmp_src = get_internal_temp(ctx, &tmp_dst);
771
772 create_mov(ctx, &tmp_dst, src);
773
774 return tmp_src;
775 }
776
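/* Find (or allocate) a slot for an immediate value in the
 * shader's immediate space.  Immediates are packed four scalars
 * per vec4 slot (idx = i / 4, swiz = i % 4), and an existing
 * slot is reused if it already holds the same value or its
 * negation (in which case the Negate bit is set on the result):
 */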
777 static void
778 get_immediate(struct fd3_compile_context *ctx,
779 struct tgsi_src_register *reg, uint32_t val)
780 {
781 unsigned neg, swiz, idx, i;
782 /* actually maps 1:1 currently.. not sure if that is safe to rely on: */
783 static const unsigned swiz2tgsi[] = {
784 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
785 };
786
787 for (i = 0; i < ctx->immediate_idx; i++) {
788 swiz = i % 4;
789 idx = i / 4;
790
791 if (ctx->so->immediates[idx].val[swiz] == val) {
792 neg = 0;
793 break;
794 }
795
796 if (ctx->so->immediates[idx].val[swiz] == (val ^ 0x80000000)) { /* negated float */
797 neg = 1;
798 break;
799 }
800 }
801
802 if (i == ctx->immediate_idx) {
803 /* need to generate a new immediate: */
804 swiz = i % 4;
805 idx = i / 4;
806 neg = 0;
807 ctx->so->immediates[idx].val[swiz] = val;
808 ctx->so->immediates_count = idx + 1;
809 ctx->immediate_idx++;
810 }
811
812 reg->File = TGSI_FILE_IMMEDIATE;
813 reg->Indirect = 0;
814 reg->Dimension = 0;
815 reg->Index = idx;
816 reg->Absolute = 0;
817 reg->Negate = neg;
818 reg->SwizzleX = swiz2tgsi[swiz];
819 reg->SwizzleY = swiz2tgsi[swiz];
820 reg->SwizzleZ = swiz2tgsi[swiz];
821 reg->SwizzleW = swiz2tgsi[swiz];
822 }
823
824 static void
825 create_mov(struct fd3_compile_context *ctx, struct tgsi_dst_register *dst,
826 struct tgsi_src_register *src)
827 {
828 type_t type_mov = get_ftype(ctx);
829 unsigned i;
830
831 for (i = 0; i < 4; i++) {
832 /* move to destination: */
833 if (dst->WriteMask & (1 << i)) {
834 struct ir3_instruction *instr;
835
836 if (src->Absolute || src->Negate) {
837 /* can't have abs or neg on a mov instr, so use
838 * absneg.f instead to handle these cases:
839 */
840 instr = instr_create(ctx, 2, OPC_ABSNEG_F);
841 } else {
842 instr = instr_create(ctx, 1, 0);
843 instr->cat1.src_type = type_mov;
844 instr->cat1.dst_type = type_mov;
845 }
846
847 add_dst_reg(ctx, instr, dst, i);
848 add_src_reg(ctx, instr, src, src_swiz(src, i));
849 }
850 }
851 }
852
853 static void
854 create_clamp(struct fd3_compile_context *ctx,
855 struct tgsi_dst_register *dst, struct tgsi_src_register *val,
856 struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
857 {
858 struct ir3_instruction *instr;
859
860 instr = instr_create(ctx, 2, OPC_MAX_F);
861 vectorize(ctx, instr, dst, 2, val, 0, minval, 0);
862
863 instr = instr_create(ctx, 2, OPC_MIN_F);
864 vectorize(ctx, instr, dst, 2, val, 0, maxval, 0);
865 }
866
867 static void
868 create_clamp_imm(struct fd3_compile_context *ctx,
869 struct tgsi_dst_register *dst,
870 uint32_t minval, uint32_t maxval)
871 {
872 struct tgsi_src_register minconst, maxconst;
873 struct tgsi_src_register src;
874
875 src_from_dst(&src, dst);
876
877 get_immediate(ctx, &minconst, minval);
878 get_immediate(ctx, &maxconst, maxval);
879
880 create_clamp(ctx, dst, &src, &minconst, &maxconst);
881 }
882
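/* If the dst register is also read as a src by the same TGSI
 * instruction, substitute an internal temporary as dst (except
 * for the safe case of an identity swizzle with a full
 * writemask).  Otherwise later scalar instructions of the
 * vectorized operation would see partially updated src values.
 * put_dst() moves the result back into the real dst afterwards:
 */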
883 static struct tgsi_dst_register *
884 get_dst(struct fd3_compile_context *ctx, struct tgsi_full_instruction *inst)
885 {
886 struct tgsi_dst_register *dst = &inst->Dst[0].Register;
887 unsigned i;
888 for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
889 struct tgsi_src_register *src = &inst->Src[i].Register;
890 if ((src->File == dst->File) && (src->Index == dst->Index)) {
891 if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
892 (src->SwizzleX == TGSI_SWIZZLE_X) &&
893 (src->SwizzleY == TGSI_SWIZZLE_Y) &&
894 (src->SwizzleZ == TGSI_SWIZZLE_Z) &&
895 (src->SwizzleW == TGSI_SWIZZLE_W))
896 continue;
897 ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
898 ctx->tmp_dst.WriteMask = dst->WriteMask;
899 dst = &ctx->tmp_dst;
900 break;
901 }
902 }
903 return dst;
904 }
905
906 static void
907 put_dst(struct fd3_compile_context *ctx, struct tgsi_full_instruction *inst,
908 struct tgsi_dst_register *dst)
909 {
910 /* if necessary, add mov back into original dst: */
911 if (dst != &inst->Dst[0].Register) {
912 create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src);
913 }
914 }
915
916 /* helper to generate the necessary repeat and/or additional instructions
917 * to turn a scalar instruction into a vector operation:
918 */
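/* For example, "ADD dst.xyzw, src0, src1" expands (roughly,
 * ignoring the SSA bookkeeping) into one scalar instruction per
 * enabled writemask component:
 *
 *   add.f dst.x, src0.x, src1.x
 *   add.f dst.y, src0.y, src1.y
 *   add.f dst.z, src0.z, src1.z
 *   add.f dst.w, src0.w, src1.w
 */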
919 static void
920 vectorize(struct fd3_compile_context *ctx, struct ir3_instruction *instr,
921 struct tgsi_dst_register *dst, int nsrcs, ...)
922 {
923 va_list ap;
924 int i, j, n = 0;
925
926 instr_atomic_start(ctx);
927
928 add_dst_reg(ctx, instr, dst, TGSI_SWIZZLE_X);
929
930 va_start(ap, nsrcs);
931 for (j = 0; j < nsrcs; j++) {
932 struct tgsi_src_register *src =
933 va_arg(ap, struct tgsi_src_register *);
934 unsigned flags = va_arg(ap, unsigned);
935 struct ir3_register *reg;
936 if (flags & IR3_REG_IMMED) {
937 reg = ir3_reg_create(instr, 0, IR3_REG_IMMED);
938 /* this is an ugly cast.. should have put flags first! */
939 reg->iim_val = *(int *)&src;
940 } else {
941 reg = add_src_reg(ctx, instr, src, TGSI_SWIZZLE_X);
942 }
943 reg->flags |= flags & ~IR3_REG_NEGATE;
944 if (flags & IR3_REG_NEGATE)
945 reg->flags ^= IR3_REG_NEGATE;
946 }
947 va_end(ap);
948
949 for (i = 0; i < 4; i++) {
950 if (dst->WriteMask & (1 << i)) {
951 struct ir3_instruction *cur;
952
953 if (n++ == 0) {
954 cur = instr;
955 } else {
956 cur = instr_clone(ctx, instr);
957 }
958
959 ssa_dst(ctx, cur, dst, i);
960
961 /* fix-up dst register component: */
962 cur->regs[0]->num = regid(cur->regs[0]->num >> 2, i);
963
964 /* fix-up src register component: */
965 va_start(ap, nsrcs);
966 for (j = 0; j < nsrcs; j++) {
967 struct ir3_register *reg = cur->regs[j+1];
968 struct tgsi_src_register *src =
969 va_arg(ap, struct tgsi_src_register *);
970 unsigned flags = va_arg(ap, unsigned);
971 if (reg->flags & IR3_REG_SSA) {
972 ssa_src(ctx, reg, src, src_swiz(src, i));
973 } else if (!(flags & IR3_REG_IMMED)) {
974 reg->num = regid(reg->num >> 2, src_swiz(src, i));
975 }
976 }
977 va_end(ap);
978 }
979 }
980
981 instr_atomic_end(ctx);
982 }
983
984 /*
985 * Handlers for TGSI instructions which do not have a 1:1 mapping to
986 * native instructions:
987 */
988
989 static void
990 trans_clamp(const struct instr_translater *t,
991 struct fd3_compile_context *ctx,
992 struct tgsi_full_instruction *inst)
993 {
994 struct tgsi_dst_register *dst = get_dst(ctx, inst);
995 struct tgsi_src_register *src0 = &inst->Src[0].Register;
996 struct tgsi_src_register *src1 = &inst->Src[1].Register;
997 struct tgsi_src_register *src2 = &inst->Src[2].Register;
998
999 create_clamp(ctx, dst, src0, src1, src2);
1000
1001 put_dst(ctx, inst, dst);
1002 }
1003
1004 /* ARL(x) = x, but mova from hrN.x to a0.. */
1005 static void
1006 trans_arl(const struct instr_translater *t,
1007 struct fd3_compile_context *ctx,
1008 struct tgsi_full_instruction *inst)
1009 {
1010 struct ir3_instruction *instr;
1011 struct tgsi_dst_register tmp_dst;
1012 struct tgsi_src_register *tmp_src;
1013 struct tgsi_dst_register *dst = &inst->Dst[0].Register;
1014 struct tgsi_src_register *src = &inst->Src[0].Register;
1015 unsigned chan = src->SwizzleX;
1016 compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);
1017
1018 tmp_src = get_internal_temp_hr(ctx, &tmp_dst);
1019
1020 /* cov.{f32,f16}s16 Rtmp, Rsrc */
1021 instr = instr_create(ctx, 1, 0);
1022 instr->cat1.src_type = get_ftype(ctx);
1023 instr->cat1.dst_type = TYPE_S16;
1024 add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
1025 add_src_reg(ctx, instr, src, chan);
1026
1027 /* shl.b Rtmp, Rtmp, 2 */
1028 instr = instr_create(ctx, 2, OPC_SHL_B);
1029 add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
1030 add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
1031 ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
1032
1033 /* mova a0, Rtmp */
1034 instr = instr_create(ctx, 1, 0);
1035 instr->cat1.src_type = TYPE_S16;
1036 instr->cat1.dst_type = TYPE_S16;
1037 add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
1038 add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
1039 }
1040
1041 /* texture fetch/sample instructions: */
1042 static void
1043 trans_samp(const struct instr_translater *t,
1044 struct fd3_compile_context *ctx,
1045 struct tgsi_full_instruction *inst)
1046 {
1047 struct ir3_instruction *instr;
1048 struct tgsi_src_register *coord = &inst->Src[0].Register;
1049 struct tgsi_src_register *samp = &inst->Src[1].Register;
1050 unsigned tex = inst->Texture.Texture;
1051 int8_t *order;
1052 unsigned i, flags = 0, src_wrmask;
1053 bool needs_mov = false;
1054
1055 switch (t->arg) {
1056 case TGSI_OPCODE_TEX:
1057 if (tex == TGSI_TEXTURE_2D) {
1058 order = (int8_t[4]){ 0, 1, -1, -1 };
1059 src_wrmask = TGSI_WRITEMASK_XY;
1060 } else {
1061 order = (int8_t[4]){ 0, 1, 2, -1 };
1062 src_wrmask = TGSI_WRITEMASK_XYZ;
1063 }
1064 break;
1065 case TGSI_OPCODE_TXP:
1066 if (tex == TGSI_TEXTURE_2D) {
1067 order = (int8_t[4]){ 0, 1, 3, -1 };
1068 src_wrmask = TGSI_WRITEMASK_XYZ;
1069 } else {
1070 order = (int8_t[4]){ 0, 1, 2, 3 };
1071 src_wrmask = TGSI_WRITEMASK_XYZW;
1072 }
1073 flags |= IR3_INSTR_P;
1074 break;
1075 default:
1076 compile_assert(ctx, 0);
1077 break;
1078 }
1079
1080 if ((tex == TGSI_TEXTURE_3D) || (tex == TGSI_TEXTURE_CUBE))
1081 flags |= IR3_INSTR_3D;
1082
1083 /* cat5 instructions don't seem to be able to handle const or relative: */
1084 if (is_rel_or_const(coord))
1085 needs_mov = true;
1086
1087 /* The texture sample instructions need the coord in successive
1088 * registers/components (ie. src.xy but not src.yx). And TXP
1089 * needs the .w component in .z for 2D.. so in some cases we
1090 * might need to emit some mov instructions to shuffle things
1091 * around:
1092 */
1093 for (i = 1; (i < 4) && (order[i] >= 0) && !needs_mov; i++)
1094 if (src_swiz(coord, i) != (src_swiz(coord, 0) + order[i]))
1095 needs_mov = true;
1096
1097 if (needs_mov) {
1098 struct tgsi_dst_register tmp_dst;
1099 struct tgsi_src_register *tmp_src;
1100 unsigned j;
1101
1102 type_t type_mov = get_ftype(ctx);
1103
1104 /* need to move things around: */
1105 tmp_src = get_internal_temp(ctx, &tmp_dst);
1106
1107 for (j = 0; (j < 4) && (order[j] >= 0); j++) {
1108 instr = instr_create(ctx, 1, 0);
1109 instr->cat1.src_type = type_mov;
1110 instr->cat1.dst_type = type_mov;
1111 add_dst_reg(ctx, instr, &tmp_dst, j);
1112 add_src_reg(ctx, instr, coord,
1113 src_swiz(coord, order[j]));
1114 }
1115
1116 coord = tmp_src;
1117 }
1118
1119 instr = instr_create(ctx, 5, t->opc);
1120 instr->cat5.type = get_ftype(ctx);
1121 instr->cat5.samp = samp->Index;
1122 instr->cat5.tex = samp->Index;
1123 instr->flags |= flags;
1124
1125 add_dst_reg_wrmask(ctx, instr, &inst->Dst[0].Register, 0,
1126 inst->Dst[0].Register.WriteMask);
1127
1128 add_src_reg_wrmask(ctx, instr, coord, coord->SwizzleX, src_wrmask);
1129 }
1130
1131 /*
1132 * SEQ(a,b) = (a == b) ? 1.0 : 0.0
1133 * cmps.f.eq tmp0, a, b
1134 * cov.u16f16 dst, tmp0
1135 *
1136 * SNE(a,b) = (a != b) ? 1.0 : 0.0
1137 * cmps.f.ne tmp0, a, b
1138 * cov.u16f16 dst, tmp0
1139 *
1140 * SGE(a,b) = (a >= b) ? 1.0 : 0.0
1141 * cmps.f.ge tmp0, a, b
1142 * cov.u16f16 dst, tmp0
1143 *
1144 * SLE(a,b) = (a <= b) ? 1.0 : 0.0
1145 * cmps.f.le tmp0, a, b
1146 * cov.u16f16 dst, tmp0
1147 *
1148 * SGT(a,b) = (a > b) ? 1.0 : 0.0
1149 * cmps.f.gt tmp0, a, b
1150 * cov.u16f16 dst, tmp0
1151 *
1152 * SLT(a,b) = (a < b) ? 1.0 : 0.0
1153 * cmps.f.lt tmp0, a, b
1154 * cov.u16f16 dst, tmp0
1155 *
1156 * CMP(a,b,c) = (a < 0.0) ? b : c
1157 * cmps.f.lt tmp0, a, {0.0}
1158 * sel.b16 dst, b, tmp0, c
1159 */
1160 static void
1161 trans_cmp(const struct instr_translater *t,
1162 struct fd3_compile_context *ctx,
1163 struct tgsi_full_instruction *inst)
1164 {
1165 struct ir3_instruction *instr;
1166 struct tgsi_dst_register tmp_dst;
1167 struct tgsi_src_register *tmp_src;
1168 struct tgsi_src_register constval0;
1169 /* final instruction for CMP() uses orig src1 and src2: */
1170 struct tgsi_dst_register *dst = get_dst(ctx, inst);
1171 struct tgsi_src_register *a0, *a1, *a2;
1172 unsigned condition;
1173
1174 tmp_src = get_internal_temp(ctx, &tmp_dst);
1175
1176 a0 = &inst->Src[0].Register; /* a */
1177 a1 = &inst->Src[1].Register; /* b */
1178
1179 switch (t->tgsi_opc) {
1180 case TGSI_OPCODE_SEQ:
1181 condition = IR3_COND_EQ;
1182 break;
1183 case TGSI_OPCODE_SNE:
1184 condition = IR3_COND_NE;
1185 break;
1186 case TGSI_OPCODE_SGE:
1187 condition = IR3_COND_GE;
1188 break;
1189 case TGSI_OPCODE_SLT:
1190 condition = IR3_COND_LT;
1191 break;
1192 case TGSI_OPCODE_SLE:
1193 condition = IR3_COND_LE;
1194 break;
1195 case TGSI_OPCODE_SGT:
1196 condition = IR3_COND_GT;
1197 break;
1198 case TGSI_OPCODE_CMP:
1199 get_immediate(ctx, &constval0, fui(0.0));
1200 a0 = &inst->Src[0].Register; /* a */
1201 a1 = &constval0; /* {0.0} */
1202 condition = IR3_COND_LT;
1203 break;
1204 default:
1205 compile_assert(ctx, 0);
1206 return;
1207 }
1208
1209 if (is_const(a0) && is_const(a1))
1210 a0 = get_unconst(ctx, a0);
1211
1212 /* cmps.f.<cond> tmp, a0, a1 */
1213 instr = instr_create(ctx, 2, OPC_CMPS_F);
1214 instr->cat2.condition = condition;
1215 vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
1216
1217 switch (t->tgsi_opc) {
1218 case TGSI_OPCODE_SEQ:
1219 case TGSI_OPCODE_SGE:
1220 case TGSI_OPCODE_SLE:
1221 case TGSI_OPCODE_SNE:
1222 case TGSI_OPCODE_SGT:
1223 case TGSI_OPCODE_SLT:
1224 /* cov.u16f16 dst, tmp0 */
1225 instr = instr_create(ctx, 1, 0);
1226 instr->cat1.src_type = get_utype(ctx);
1227 instr->cat1.dst_type = get_ftype(ctx);
1228 vectorize(ctx, instr, dst, 1, tmp_src, 0);
1229 break;
1230 case TGSI_OPCODE_CMP:
1231 a1 = &inst->Src[1].Register;
1232 a2 = &inst->Src[2].Register;
1233 /* sel.{b32,b16} dst, src1, tmp, src2 */
1234 instr = instr_create(ctx, 3,
1235 ctx->so->half_precision ? OPC_SEL_B16 : OPC_SEL_B32);
1236 vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
1237
1238 break;
1239 }
1240
1241 put_dst(ctx, inst, dst);
1242 }
1243
1244 /*
1245 * Conditional / Flow control
1246 */
1247
1248 static void
1249 push_branch(struct fd3_compile_context *ctx, struct ir3_instruction *instr)
1250 {
1251 ctx->branch[ctx->branch_count++] = instr;
1252 }
1253
1254 static struct ir3_instruction *
1255 pop_branch(struct fd3_compile_context *ctx)
1256 {
1257 return ctx->branch[--ctx->branch_count];
1258 }
1259
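/* IF: compute a flow condition from src.x and push a meta:flow
 * instruction / new block.  The condition is built as:
 *
 *   cmps.f.eq tmp0, src, {0.0}   ; tmp0 = (src.x == 0.0) ? 1 : 0
 *   add.s     tmp0, tmp0, -1     ; tmp0 = (src.x != 0.0) ? -1 : 0
 */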
1260 static void
1261 trans_if(const struct instr_translater *t,
1262 struct fd3_compile_context *ctx,
1263 struct tgsi_full_instruction *inst)
1264 {
1265 struct ir3_instruction *instr;
1266 struct tgsi_src_register *src = &inst->Src[0].Register;
1267 struct tgsi_dst_register tmp_dst;
1268 struct tgsi_src_register *tmp_src;
1269 struct tgsi_src_register constval;
1270
1271 get_immediate(ctx, &constval, fui(0.0));
1272 tmp_src = get_internal_temp(ctx, &tmp_dst);
1273
1274 if (is_const(src))
1275 src = get_unconst(ctx, src);
1276
1277 /* cmps.f.eq tmp0, src, {0.0} */
1278 instr = instr_create(ctx, 2, OPC_CMPS_F);
1279 add_dst_reg(ctx, instr, &tmp_dst, 0);
1280 add_src_reg(ctx, instr, src, src->SwizzleX);
1281 add_src_reg(ctx, instr, &constval, constval.SwizzleX);
1282 instr->cat2.condition = IR3_COND_EQ;
1283
1284 /* add.s tmp0, tmp0, -1 */
1285 instr = instr_create(ctx, 2, OPC_ADD_S);
1286 add_dst_reg(ctx, instr, &tmp_dst, TGSI_SWIZZLE_X);
1287 add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);
1288 ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -1;
1289
1290 /* meta:flow tmp0 */
1291 instr = instr_create(ctx, -1, OPC_META_FLOW);
1292 ir3_reg_create(instr, 0, 0); /* dummy dst */
1293 add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);
1294
1295 push_branch(ctx, instr);
1296 instr->flow.if_block = push_block(ctx);
1297 }
1298
1299 static void
1300 trans_else(const struct instr_translater *t,
1301 struct fd3_compile_context *ctx,
1302 struct tgsi_full_instruction *inst)
1303 {
1304 struct ir3_instruction *instr;
1305
1306 pop_block(ctx);
1307
1308 instr = pop_branch(ctx);
1309
1310 compile_assert(ctx, (instr->category == -1) &&
1311 (instr->opc == OPC_META_FLOW));
1312
1313 push_branch(ctx, instr);
1314 instr->flow.else_block = push_block(ctx);
1315 }
1316
1317 static struct ir3_instruction *
1318 find_temporary(struct ir3_block *block, unsigned n)
1319 {
1320 if (block->parent && !block->temporaries[n])
1321 return find_temporary(block->parent, n);
1322 return block->temporaries[n];
1323 }
1324
1325 static struct ir3_instruction *
1326 find_output(struct ir3_block *block, unsigned n)
1327 {
1328 if (block->parent && !block->outputs[n])
1329 return find_output(block->parent, n);
1330 return block->outputs[n];
1331 }
1332
1333 static struct ir3_instruction *
1334 create_phi(struct fd3_compile_context *ctx, struct ir3_instruction *cond,
1335 struct ir3_instruction *a, struct ir3_instruction *b)
1336 {
1337 struct ir3_instruction *phi;
1338
1339 compile_assert(ctx, cond);
1340
1341 /* Either side of the condition could be null.. which
1342 * indicates a variable written on only one side of the
1343 * branch. Normally this should only be variables not
1344 * used outside of that side of the branch. So we could
1345 * just 'return a ? a : b;' in that case. But for better
1346 * defined undefined behavior we just stick in imm{0.0}.
1347 * In the common case of a value only used within the
1348 * one side of the branch, the PHI instruction will not
1349 * get scheduled
1350 */
1351 if (!a)
1352 a = create_immed(ctx, 0.0);
1353 if (!b)
1354 b = create_immed(ctx, 0.0);
1355
1356 phi = instr_create(ctx, -1, OPC_META_PHI);
1357 ir3_reg_create(phi, 0, 0); /* dummy dst */
1358 ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond;
1359 ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a;
1360 ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b;
1361
1362 return phi;
1363 }
1364
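/* At ENDIF, generate a phi for each temporary/output written on
 * either side of the branch, so that following instructions see
 * a single merged value.  The per-side values are also recorded
 * as block outputs of the if/else blocks:
 */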
1365 static void
1366 trans_endif(const struct instr_translater *t,
1367 struct fd3_compile_context *ctx,
1368 struct tgsi_full_instruction *inst)
1369 {
1370 struct ir3_instruction *instr;
1371 struct ir3_block *ifb, *elseb;
1372 struct ir3_instruction **ifout, **elseout;
1373 unsigned i, ifnout = 0, elsenout = 0;
1374
1375 pop_block(ctx);
1376
1377 instr = pop_branch(ctx);
1378
1379 compile_assert(ctx, (instr->category == -1) &&
1380 (instr->opc == OPC_META_FLOW));
1381
1382 ifb = instr->flow.if_block;
1383 elseb = instr->flow.else_block;
1384 /* if there is no else block, the parent block is used for the
1385 * branch-not-taken src of the PHI instructions:
1386 */
1387 if (!elseb)
1388 elseb = ifb->parent;
1389
1390 /* count up number of outputs for each block: */
1391 for (i = 0; i < ifb->ntemporaries; i++) {
1392 if (ifb->temporaries[i])
1393 ifnout++;
1394 if (elseb->temporaries[i])
1395 elsenout++;
1396 }
1397 for (i = 0; i < ifb->noutputs; i++) {
1398 if (ifb->outputs[i])
1399 ifnout++;
1400 if (elseb->outputs[i])
1401 elsenout++;
1402 }
1403
1404 ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout);
1405 if (elseb != ifb->parent)
1406 elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout);
1407
1408 ifnout = 0;
1409 elsenout = 0;
1410
1411 /* generate PHI instructions for any temporaries written: */
1412 for (i = 0; i < ifb->ntemporaries; i++) {
1413 struct ir3_instruction *a = ifb->temporaries[i];
1414 struct ir3_instruction *b = elseb->temporaries[i];
1415
1416 /* if temporary written in if-block, or if else block
1417 * is present and temporary written in else-block:
1418 */
1419 if (a || ((elseb != ifb->parent) && b)) {
1420 struct ir3_instruction *phi;
1421
1422 /* if only written on one side, find the closest
1423 * enclosing update on the other side:
1424 */
1425 if (!a)
1426 a = find_temporary(ifb, i);
1427 if (!b)
1428 b = find_temporary(elseb, i);
1429
1430 ifout[ifnout] = a;
1431 a = create_output(ifb, a, ifnout++);
1432
1433 if (elseb != ifb->parent) {
1434 elseout[elsenout] = b;
1435 b = create_output(elseb, b, elsenout++);
1436 }
1437
1438 phi = create_phi(ctx, instr, a, b);
1439 ctx->block->temporaries[i] = phi;
1440 }
1441 }
1442
1443 /* .. and any outputs written: */
1444 for (i = 0; i < ifb->noutputs; i++) {
1445 struct ir3_instruction *a = ifb->outputs[i];
1446 struct ir3_instruction *b = elseb->outputs[i];
1447
1448 /* if output written in if-block, or if else block
1449 * is present and output written in else-block:
1450 */
1451 if (a || ((elseb != ifb->parent) && b)) {
1452 struct ir3_instruction *phi;
1453
1454 /* if only written on one side, find the closest
1455 * enclosing update on the other side:
1456 */
1457 if (!a)
1458 a = find_output(ifb, i);
1459 if (!b)
1460 b = find_output(elseb, i);
1461
1462 ifout[ifnout] = a;
1463 a = create_output(ifb, a, ifnout++);
1464
1465 if (elseb != ifb->parent) {
1466 elseout[elsenout] = b;
1467 b = create_output(elseb, b, elsenout++);
1468 }
1469
1470 phi = create_phi(ctx, instr, a, b);
1471 ctx->block->outputs[i] = phi;
1472 }
1473 }
1474
1475 ifb->noutputs = ifnout;
1476 ifb->outputs = ifout;
1477
1478 if (elseb != ifb->parent) {
1479 elseb->noutputs = elsenout;
1480 elseb->outputs = elseout;
1481 }
1482
1483 // TODO maybe we want to compact block->inputs?
1484 }
1485
1486 /*
1487 * Handlers for TGSI instructions which do have 1:1 mapping to native
1488 * instructions:
1489 */
1490
1491 static void
1492 instr_cat0(const struct instr_translater *t,
1493 struct fd3_compile_context *ctx,
1494 struct tgsi_full_instruction *inst)
1495 {
1496 instr_create(ctx, 0, t->opc);
1497 }
1498
1499 static void
1500 instr_cat1(const struct instr_translater *t,
1501 struct fd3_compile_context *ctx,
1502 struct tgsi_full_instruction *inst)
1503 {
1504 struct tgsi_dst_register *dst = get_dst(ctx, inst);
1505 struct tgsi_src_register *src = &inst->Src[0].Register;
1506
1507 /* mov instructions can't handle a negate on src: */
1508 if (src->Negate) {
1509 struct tgsi_src_register constval;
1510 struct ir3_instruction *instr;
1511
1512 /* since right now, we are using uniformly either TYPE_F16 or
1513 * TYPE_F32, and we don't utilize the conversion possibilities
1514 * of mov instructions, we can get away with substituting an
1515 * add.f which can handle negate. Might need to revisit this
1516 * in the future if we start supporting widening/narrowing or
1517 * conversion to/from integer..
1518 */
1519 instr = instr_create(ctx, 2, OPC_ADD_F);
1520 get_immediate(ctx, &constval, fui(0.0));
1521 vectorize(ctx, instr, dst, 2, src, 0, &constval, 0);
1522 } else {
1523 create_mov(ctx, dst, src);
1524 /* create_mov() generates vector sequence, so no vectorize() */
1525 }
1526 put_dst(ctx, inst, dst);
1527 }
1528
1529 static void
1530 instr_cat2(const struct instr_translater *t,
1531 struct fd3_compile_context *ctx,
1532 struct tgsi_full_instruction *inst)
1533 {
1534 struct tgsi_dst_register *dst = get_dst(ctx, inst);
1535 struct tgsi_src_register *src0 = &inst->Src[0].Register;
1536 struct tgsi_src_register *src1 = &inst->Src[1].Register;
1537 struct ir3_instruction *instr;
1538 unsigned src0_flags = 0, src1_flags = 0;
1539
1540 switch (t->tgsi_opc) {
1541 case TGSI_OPCODE_ABS:
1542 src0_flags = IR3_REG_ABS;
1543 break;
1544 case TGSI_OPCODE_SUB:
1545 src1_flags = IR3_REG_NEGATE;
1546 break;
1547 }
1548
1549 switch (t->opc) {
1550 case OPC_ABSNEG_F:
1551 case OPC_ABSNEG_S:
1552 case OPC_CLZ_B:
1553 case OPC_CLZ_S:
1554 case OPC_SIGN_F:
1555 case OPC_FLOOR_F:
1556 case OPC_CEIL_F:
1557 case OPC_RNDNE_F:
1558 case OPC_RNDAZ_F:
1559 case OPC_TRUNC_F:
1560 case OPC_NOT_B:
1561 case OPC_BFREV_B:
1562 case OPC_SETRM:
1563 case OPC_CBITS_B:
1564 /* these only have one src reg */
1565 instr = instr_create(ctx, 2, t->opc);
1566 vectorize(ctx, instr, dst, 1, src0, src0_flags);
1567 break;
1568 default:
1569 if (is_const(src0) && is_const(src1))
1570 src0 = get_unconst(ctx, src0);
1571
1572 instr = instr_create(ctx, 2, t->opc);
1573 vectorize(ctx, instr, dst, 2, src0, src0_flags,
1574 src1, src1_flags);
1575 break;
1576 }
1577
1578 put_dst(ctx, inst, dst);
1579 }
1580
1581 static void
1582 instr_cat3(const struct instr_translater *t,
1583 struct fd3_compile_context *ctx,
1584 struct tgsi_full_instruction *inst)
1585 {
1586 struct tgsi_dst_register *dst = get_dst(ctx, inst);
1587 struct tgsi_src_register *src0 = &inst->Src[0].Register;
1588 struct tgsi_src_register *src1 = &inst->Src[1].Register;
1589 struct ir3_instruction *instr;
1590
1591 /* in particular, cat3 can't handle a const (or relative) src1..
1592 * for mad, we can swap the first two srcs if needed:
1593 */
1594 if (is_rel_or_const(src1)) {
1595 if (is_mad(t->opc) && !is_rel_or_const(src0)) {
1596 struct tgsi_src_register *tmp;
1597 tmp = src0;
1598 src0 = src1;
1599 src1 = tmp;
1600 } else {
1601 src1 = get_unconst(ctx, src1);
1602 }
1603 }
1604
1605 instr = instr_create(ctx, 3,
1606 ctx->so->half_precision ? t->hopc : t->opc);
1607 vectorize(ctx, instr, dst, 3, src0, 0, src1, 0,
1608 &inst->Src[2].Register, 0);
1609 put_dst(ctx, inst, dst);
1610 }
1611
1612 static void
1613 instr_cat4(const struct instr_translater *t,
1614 struct fd3_compile_context *ctx,
1615 struct tgsi_full_instruction *inst)
1616 {
1617 struct tgsi_dst_register *dst = get_dst(ctx, inst);
1618 struct tgsi_src_register *src = &inst->Src[0].Register;
1619 struct ir3_instruction *instr;
1620 unsigned i;
1621
1622 /* seems like the blob compiler avoids const as src.. */
1623 if (is_const(src))
1624 src = get_unconst(ctx, src);
1625
1626 /* we need to replicate into each component: */
1627 for (i = 0; i < 4; i++) {
1628 if (dst->WriteMask & (1 << i)) {
1629 instr = instr_create(ctx, 4, t->opc);
1630 add_dst_reg(ctx, instr, dst, i);
1631 add_src_reg(ctx, instr, src, src->SwizzleX);
1632 }
1633 }
1634
1635 put_dst(ctx, inst, dst);
1636 }
1637
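/* Table mapping TGSI opcodes to their translation handler, plus
 * the native opcode(s) and extra arg where applicable.  The main
 * loop in compile_instructions() dispatches through t->fxn:
 */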
1638 static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
1639 #define INSTR(n, f, ...) \
1640 [TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
1641
1642 INSTR(MOV, instr_cat1),
1643 INSTR(RCP, instr_cat4, .opc = OPC_RCP),
1644 INSTR(RSQ, instr_cat4, .opc = OPC_RSQ),
1645 INSTR(SQRT, instr_cat4, .opc = OPC_SQRT),
1646 INSTR(MUL, instr_cat2, .opc = OPC_MUL_F),
1647 INSTR(ADD, instr_cat2, .opc = OPC_ADD_F),
1648 INSTR(SUB, instr_cat2, .opc = OPC_ADD_F),
1649 INSTR(MIN, instr_cat2, .opc = OPC_MIN_F),
1650 INSTR(MAX, instr_cat2, .opc = OPC_MAX_F),
1651 INSTR(MAD, instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
1652 INSTR(TRUNC, instr_cat2, .opc = OPC_TRUNC_F),
1653 INSTR(CLAMP, trans_clamp),
1654 INSTR(FLR, instr_cat2, .opc = OPC_FLOOR_F),
1655 INSTR(ROUND, instr_cat2, .opc = OPC_RNDNE_F),
1656 INSTR(ARL, trans_arl),
1657 INSTR(EX2, instr_cat4, .opc = OPC_EXP2),
1658 INSTR(LG2, instr_cat4, .opc = OPC_LOG2),
1659 INSTR(ABS, instr_cat2, .opc = OPC_ABSNEG_F),
1660 INSTR(COS, instr_cat4, .opc = OPC_COS),
1661 INSTR(SIN, instr_cat4, .opc = OPC_SIN),
1662 INSTR(TEX, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TEX),
1663 INSTR(TXP, trans_samp, .opc = OPC_SAM, .arg = TGSI_OPCODE_TXP),
1664 INSTR(SGT, trans_cmp),
1665 INSTR(SLT, trans_cmp),
1666 INSTR(SGE, trans_cmp),
1667 INSTR(SLE, trans_cmp),
1668 INSTR(SNE, trans_cmp),
1669 INSTR(SEQ, trans_cmp),
1670 INSTR(CMP, trans_cmp),
1671 INSTR(IF, trans_if),
1672 INSTR(ELSE, trans_else),
1673 INSTR(ENDIF, trans_endif),
1674 INSTR(END, instr_cat0, .opc = OPC_END),
1675 INSTR(KILL, instr_cat0, .opc = OPC_KILL),
1676 };
1677
1678 static fd3_semantic
1679 decl_semantic(const struct tgsi_declaration_semantic *sem)
1680 {
1681 return fd3_semantic_name(sem->Name, sem->Index);
1682 }
1683
1684 static void
1685 decl_in(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl)
1686 {
1687 struct fd3_shader_stateobj *so = ctx->so;
1688 unsigned base = ctx->base_reg[TGSI_FILE_INPUT];
1689 unsigned i, flags = 0;
1690
1691 /* I don't think we should get frag shader input without
1692 * semantic info? Otherwise how do inputs get linked to
1693 * vert outputs?
1694 */
1695 compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
1696 decl->Declaration.Semantic);
1697
1698 if (ctx->so->half_precision)
1699 flags |= IR3_REG_HALF;
1700
1701 for (i = decl->Range.First; i <= decl->Range.Last; i++) {
1702 unsigned n = so->inputs_count++;
1703 unsigned r = regid(i + base, 0);
1704 unsigned ncomp, j;
1705
1706 /* TODO use ctx->info.input_usage_mask[decl->Range.n] to figure out ncomp: */
1707 ncomp = 4;
1708
1709 DBG("decl in -> r%d", i + base);
1710
1711 so->inputs[n].semantic = decl_semantic(&decl->Semantic);
1712 so->inputs[n].compmask = (1 << ncomp) - 1;
1713 so->inputs[n].regid = r;
1714 so->inputs[n].inloc = ctx->next_inloc;
1715 ctx->next_inloc += ncomp;
1716
1717 so->total_in += ncomp;
1718
1719 for (j = 0; j < ncomp; j++) {
1720 struct ir3_instruction *instr;
1721
1722 if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
1723 struct ir3_register *src;
1724
1725 instr = instr_create(ctx, 2, OPC_BARY_F);
1726
1727 /* dst register: */
1728 ir3_reg_create(instr, r + j, flags);
1729
1730 /* input position: */
1731 ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val =
1732 so->inputs[n].inloc + j - 8;
1733
1734 /* input base (always r0.xy): */
1735 src = ir3_reg_create(instr, regid(0,0), IR3_REG_SSA);
1736 src->wrmask = 0x3;
1737 src->instr = ctx->frag_pos;
1738
1739 } else {
1740 instr = create_input(ctx->block, NULL, (i * 4) + j);
1741 }
1742
1743 ctx->block->inputs[(i * 4) + j] = instr;
1744 }
1745 }
1746 }
1747
1748 static void
1749 decl_out(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl)
1750 {
1751 struct fd3_shader_stateobj *so = ctx->so;
1752 unsigned base = ctx->base_reg[TGSI_FILE_OUTPUT];
1753 unsigned comp = 0;
1754 unsigned name = decl->Semantic.Name;
1755 unsigned i;
1756
1757 compile_assert(ctx, decl->Declaration.Semantic);
1758
1759 DBG("decl out[%d] -> r%d", name, decl->Range.First + base);
1760
1761 if (ctx->type == TGSI_PROCESSOR_VERTEX) {
1762 switch (name) {
1763 case TGSI_SEMANTIC_POSITION:
1764 so->writes_pos = true;
1765 /* fallthrough */
1766 case TGSI_SEMANTIC_PSIZE:
1767 case TGSI_SEMANTIC_COLOR:
1768 case TGSI_SEMANTIC_GENERIC:
1769 case TGSI_SEMANTIC_FOG:
1770 case TGSI_SEMANTIC_TEXCOORD:
1771 break;
1772 default:
1773 compile_error(ctx, "unknown VS semantic name: %s\n",
1774 tgsi_semantic_names[name]);
1775 }
1776 } else {
1777 switch (name) {
1778 case TGSI_SEMANTIC_POSITION:
1779 comp = 2; /* tgsi will write to .z component */
1780 so->writes_pos = true;
1781 /* fallthrough */
1782 case TGSI_SEMANTIC_COLOR:
1783 break;
1784 default:
1785 compile_error(ctx, "unknown FS semantic name: %s\n",
1786 tgsi_semantic_names[name]);
1787 }
1788 }
1789
1790 for (i = decl->Range.First; i <= decl->Range.Last; i++) {
1791 unsigned n = so->outputs_count++;
1792 unsigned ncomp, j;
1793
1794 ncomp = 4;
1795
1796 so->outputs[n].semantic = decl_semantic(&decl->Semantic);
1797 so->outputs[n].regid = regid(i + base, comp);
1798
1799 /* avoid undefined outputs by sticking in a dummy mov from
1800 * imm{0.0}, which will simply be over-written if the output
1801 * is actually assigned:
1802 */
1803 for (j = 0; j < ncomp; j++)
1804 ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0);
1805 }
1806 }
1807
1808 static void
1809 decl_samp(struct fd3_compile_context *ctx, struct tgsi_full_declaration *decl)
1810 {
1811 ctx->so->samplers_count++;
1812 }
1813
1814 static void
1815 compile_instructions(struct fd3_compile_context *ctx)
1816 {
1817 push_block(ctx);
1818
1819 /* for fragment shader, we have a single input register (r0.xy)
1820 * which is used as the base for bary.f varying fetch instrs:
1821 */
1822 if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
1823 struct ir3_instruction *instr;
1824 instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
1825 ir3_reg_create(instr, 0, 0);
1826 ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.x */
1827 ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.y */
1828 ctx->frag_pos = instr;
1829 }
1830
1831 while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
1832 tgsi_parse_token(&ctx->parser);
1833
1834 switch (ctx->parser.FullToken.Token.Type) {
1835 case TGSI_TOKEN_TYPE_DECLARATION: {
1836 struct tgsi_full_declaration *decl =
1837 &ctx->parser.FullToken.FullDeclaration;
1838 if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
1839 decl_out(ctx, decl);
1840 } else if (decl->Declaration.File == TGSI_FILE_INPUT) {
1841 decl_in(ctx, decl);
1842 } else if (decl->Declaration.File == TGSI_FILE_SAMPLER) {
1843 decl_samp(ctx, decl);
1844 }
1845 break;
1846 }
1847 case TGSI_TOKEN_TYPE_IMMEDIATE: {
1848 /* TODO: if we know the immediate is small enough, and only
1849 * used with instructions that can embed an immediate, we
1850 * can skip this:
1851 */
1852 struct tgsi_full_immediate *imm =
1853 &ctx->parser.FullToken.FullImmediate;
1854 unsigned n = ctx->so->immediates_count++;
1855 memcpy(ctx->so->immediates[n].val, imm->u, 16);
1856 break;
1857 }
1858 case TGSI_TOKEN_TYPE_INSTRUCTION: {
1859 struct tgsi_full_instruction *inst =
1860 &ctx->parser.FullToken.FullInstruction;
1861 unsigned opc = inst->Instruction.Opcode;
1862 const struct instr_translater *t = &translaters[opc];
1863
1864 if (t->fxn) {
1865 t->fxn(t, ctx, inst);
1866 ctx->num_internal_temps = 0;
1867 } else {
1868 compile_error(ctx, "unknown TGSI opc: %s\n",
1869 tgsi_get_opcode_name(opc));
1870 }
1871
1872 switch (inst->Instruction.Saturate) {
1873 case TGSI_SAT_ZERO_ONE:
1874 create_clamp_imm(ctx, &inst->Dst[0].Register,
1875 fui(0.0), fui(1.0));
1876 break;
1877 case TGSI_SAT_MINUS_PLUS_ONE:
1878 create_clamp_imm(ctx, &inst->Dst[0].Register,
1879 fui(-1.0), fui(1.0));
1880 break;
1881 }
1882
1883 instr_finish(ctx);
1884
1885 break;
1886 }
1887 default:
1888 break;
1889 }
1890 }
1891
1892 /* fixup actual inputs for frag shader: */
1893 if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
1894 struct ir3_instruction *instr;
1895
1896 ctx->block->ninputs = 2;
1897
1898 /* r0.x */
1899 instr = create_input(ctx->block, NULL, 0);
1900 ctx->block->inputs[0] = instr;
1901 ctx->frag_pos->regs[1]->instr = instr;
1902
1903 /* r0.y */
1904 instr = create_input(ctx->block, NULL, 1);
1905 ctx->block->inputs[1] = instr;
1906 ctx->frag_pos->regs[2]->instr = instr;
1907 }
1908 }
1909
1910 static void
1911 compile_dump(struct fd3_compile_context *ctx)
1912 {
1913 const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
1914 static unsigned n = 0;
1915 char fname[16];
1916 FILE *f;
1917 snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
1918 f = fopen(fname, "w");
1919 if (!f)
1920 return;
1921 ir3_block_depth(ctx->block);
1922 ir3_shader_dump(ctx->ir, name, ctx->block, f);
1923 fclose(f);
1924 }
1925
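/* Main entry point: translate the TGSI tokens into ir3, run the
 * ir3 passes over the result (flatten, copy propagation, depth,
 * scheduling, register allocation), then fix up the final
 * input/output register assignments:
 */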
1926 int
1927 fd3_compile_shader(struct fd3_shader_stateobj *so,
1928 const struct tgsi_token *tokens)
1929 {
1930 struct fd3_compile_context ctx;
1931 unsigned i, actual_in;
1932 int ret = 0;
1933
1934 assert(!so->ir);
1935
1936 so->ir = ir3_shader_create();
1937
1938 assert(so->ir);
1939
1940 if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) {
1941 ret = -1;
1942 goto out;
1943 }
1944
1945 compile_instructions(&ctx);
1946
1947 if (fd_mesa_debug & FD_DBG_OPTDUMP)
1948 compile_dump(&ctx);
1949
1950 ret = ir3_block_flatten(ctx.block);
1951 if (ret < 0)
1952 goto out;
1953 if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP))
1954 compile_dump(&ctx);
1955
1956 ir3_block_cp(ctx.block);
1957
1958 if (fd_mesa_debug & FD_DBG_OPTDUMP)
1959 compile_dump(&ctx);
1960
1961 ir3_block_depth(ctx.block);
1962
1963 if (fd_mesa_debug & FD_DBG_OPTMSGS) {
1964 printf("AFTER DEPTH:\n");
1965 ir3_dump_instr_list(ctx.block->head);
1966 }
1967
1968 ir3_block_sched(ctx.block);
1969
1970 if (fd_mesa_debug & FD_DBG_OPTMSGS) {
1971 printf("AFTER SCHED:\n");
1972 ir3_dump_instr_list(ctx.block->head);
1973 }
1974
1975 ret = ir3_block_ra(ctx.block, so->type);
1976 if (ret)
1977 goto out;
1978
1979 if (fd_mesa_debug & FD_DBG_OPTMSGS) {
1980 printf("AFTER RA:\n");
1981 ir3_dump_instr_list(ctx.block->head);
1982 }
1983
1984 /* fixup input/outputs: */
1985 for (i = 0; i < so->outputs_count; i++) {
1986 so->outputs[i].regid = ctx.block->outputs[i*4]->regs[0]->num;
1987 /* preserve hack for depth output.. tgsi writes depth to .z,
1988 * but what we give the hw is the scalar register:
1989 */
1990 if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
1991 (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
1992 so->outputs[i].regid += 2;
1993 }
1994 /* Note that some or all channels of an input may be unused: */
1995 actual_in = 0;
1996 for (i = 0; i < so->inputs_count; i++) {
1997 unsigned j, regid = ~0, compmask = 0;
1998 for (j = 0; j < 4; j++) {
1999 struct ir3_instruction *in = ctx.block->inputs[(i*4) + j];
2000 if (in) {
2001 compmask |= (1 << j);
2002 regid = in->regs[0]->num - j;
2003 actual_in++;
2004 }
2005 }
2006 so->inputs[i].regid = regid;
2007 so->inputs[i].compmask = compmask;
2008 }
2009
2010 /* the fragment shader always gets full vec4's even if it doesn't
2011 * fetch all components, but for the vertex shader we need to update
2012 * with the actual number of components fetched, otherwise things
2013 * will hang due to mismatch between VFD_DECODE's and
2014 * TOTALATTRTOVS
2015 */
2016 if (so->type == SHADER_VERTEX)
2017 so->total_in = actual_in;
2018
2019 out:
2020 if (ret) {
2021 ir3_shader_destroy(so->ir);
2022 so->ir = NULL;
2023 }
2024 compile_free(&ctx);
2025
2026 return ret;
2027 }