gallivm: work around slow code generated for interleaving 128bit vectors
[mesa.git] / src / gallium / drivers / freedreno / freedreno_compiler.c
1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
2
3 /*
4 * Copyright (C) 2012 Rob Clark <robclark@freedesktop.org>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 *
25 * Authors:
26 * Rob Clark <robclark@freedesktop.org>
27 */
28
29 #include "pipe/p_state.h"
30 #include "util/u_string.h"
31 #include "util/u_memory.h"
32 #include "util/u_inlines.h"
33 #include "tgsi/tgsi_parse.h"
34 #include "tgsi/tgsi_ureg.h"
35 #include "tgsi/tgsi_info.h"
36 #include "tgsi/tgsi_strings.h"
37 #include "tgsi/tgsi_dump.h"
38
39 #include "freedreno_program.h"
40 #include "freedreno_compiler.h"
41 #include "freedreno_util.h"
42
43 #include "instr-a2xx.h"
44 #include "ir-a2xx.h"
45
/* Per-compile state, valid for the duration of one shader compile. */
struct fd_compile_context {
	struct fd_program_stateobj *prog;   /* containing program (for export linkage) */
	struct fd_shader_stateobj *so;      /* shader being compiled */

	struct tgsi_parse_context parser;
	unsigned type;                      /* TGSI_PROCESSOR_VERTEX or _FRAGMENT */

	/* predicate stack: */
	int pred_depth;
	enum ir2_pred pred_stack[8];

	/* Internal-Temporary and Predicate register assignment:
	 *
	 * Some TGSI instructions which translate into multiple actual
	 * instructions need one or more temporary registers, which are not
	 * assigned from TGSI perspective (ie. not TGSI_FILE_TEMPORARY).
	 * And some instructions (texture fetch) cannot write directly to
	 * output registers.  We could be more clever and re-use dst or a
	 * src register in some cases.  But for now don't try to be clever.
	 * Eventually we should implement an optimization pass that re-
	 * juggles the register usage and gets rid of unneeded temporaries.
	 *
	 * The predicate register must be valid across multiple TGSI
	 * instructions, but internal temporary's do not.  For this reason,
	 * once the predicate register is requested, until it is no longer
	 * needed, it gets the first register slot after the TGSI
	 * assigned temporaries (ie. num_regs[TGSI_FILE_TEMPORARY]), and the
	 * internal temporaries get the register slots above this.
	 */

	int pred_reg;                       /* -1 when no predicate reg assigned */
	int num_internal_temps;

	/* register count per TGSI file, filled in by the declaration pass */
	uint8_t num_regs[TGSI_FILE_COUNT];

	/* maps input register idx to prog->export_linkage idx: */
	uint8_t input_export_idx[64];

	/* maps output register idx to prog->export_linkage idx: */
	uint8_t output_export_idx[64];

	/* idx/slot for last compiler generated immediate */
	unsigned immediate_idx;

	// TODO we can skip emit exports in the VS that the FS doesn't need..
	// and get rid perhaps of num_param..
	unsigned num_position, num_param;
	unsigned position, psize;           /* output reg idx of POSITION/PSIZE (~0 if none) */

	/* bitmask of registers written by fetch instrs that still need a
	 * sync bit set on their first ALU read:
	 */
	uint64_t need_sync;

	/* current exec CF instruction */
	struct ir2_cf *cf;
};
100
101 static int
102 semantic_idx(struct tgsi_declaration_semantic *semantic)
103 {
104 int idx = semantic->Name;
105 if (idx == TGSI_SEMANTIC_GENERIC)
106 idx = TGSI_SEMANTIC_COUNT + semantic->Index;
107 return idx;
108 }
109
110 /* assign/get the input/export register # for given semantic idx as
111 * returned by semantic_idx():
112 */
113 static int
114 export_linkage(struct fd_compile_context *ctx, int idx)
115 {
116 struct fd_program_stateobj *prog = ctx->prog;
117
118 /* if first time we've seen this export, assign the next available slot: */
119 if (prog->export_linkage[idx] == 0xff)
120 prog->export_linkage[idx] = prog->num_exports++;
121
122 return prog->export_linkage[idx];
123 }
124
125 static unsigned
126 compile_init(struct fd_compile_context *ctx, struct fd_program_stateobj *prog,
127 struct fd_shader_stateobj *so)
128 {
129 unsigned ret;
130
131 ctx->prog = prog;
132 ctx->so = so;
133 ctx->cf = NULL;
134 ctx->pred_depth = 0;
135
136 ret = tgsi_parse_init(&ctx->parser, so->tokens);
137 if (ret != TGSI_PARSE_OK)
138 return ret;
139
140 ctx->type = ctx->parser.FullHeader.Processor.Processor;
141 ctx->position = ~0;
142 ctx->psize = ~0;
143 ctx->num_position = 0;
144 ctx->num_param = 0;
145 ctx->need_sync = 0;
146 ctx->immediate_idx = 0;
147 ctx->pred_reg = -1;
148 ctx->num_internal_temps = 0;
149
150 memset(ctx->num_regs, 0, sizeof(ctx->num_regs));
151 memset(ctx->input_export_idx, 0, sizeof(ctx->input_export_idx));
152 memset(ctx->output_export_idx, 0, sizeof(ctx->output_export_idx));
153
154 /* do first pass to extract declarations: */
155 while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
156 tgsi_parse_token(&ctx->parser);
157
158 switch (ctx->parser.FullToken.Token.Type) {
159 case TGSI_TOKEN_TYPE_DECLARATION: {
160 struct tgsi_full_declaration *decl =
161 &ctx->parser.FullToken.FullDeclaration;
162 if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
163 unsigned name = decl->Semantic.Name;
164
165 assert(decl->Declaration.Semantic); // TODO is this ever not true?
166
167 ctx->output_export_idx[decl->Range.First] =
168 semantic_idx(&decl->Semantic);
169
170 if (ctx->type == TGSI_PROCESSOR_VERTEX) {
171 switch (name) {
172 case TGSI_SEMANTIC_POSITION:
173 ctx->position = ctx->num_regs[TGSI_FILE_OUTPUT];
174 ctx->num_position++;
175 break;
176 case TGSI_SEMANTIC_PSIZE:
177 ctx->psize = ctx->num_regs[TGSI_FILE_OUTPUT];
178 ctx->num_position++;
179 case TGSI_SEMANTIC_COLOR:
180 case TGSI_SEMANTIC_GENERIC:
181 ctx->num_param++;
182 break;
183 default:
184 DBG("unknown VS semantic name: %s",
185 tgsi_semantic_names[name]);
186 assert(0);
187 }
188 } else {
189 switch (name) {
190 case TGSI_SEMANTIC_COLOR:
191 case TGSI_SEMANTIC_GENERIC:
192 ctx->num_param++;
193 break;
194 default:
195 DBG("unknown PS semantic name: %s",
196 tgsi_semantic_names[name]);
197 assert(0);
198 }
199 }
200 } else if (decl->Declaration.File == TGSI_FILE_INPUT) {
201 ctx->input_export_idx[decl->Range.First] =
202 semantic_idx(&decl->Semantic);
203 }
204 ctx->num_regs[decl->Declaration.File] =
205 MAX2(ctx->num_regs[decl->Declaration.File], decl->Range.Last + 1);
206 break;
207 }
208 case TGSI_TOKEN_TYPE_IMMEDIATE: {
209 struct tgsi_full_immediate *imm =
210 &ctx->parser.FullToken.FullImmediate;
211 unsigned n = ctx->so->num_immediates++;
212 memcpy(ctx->so->immediates[n].val, imm->u, 16);
213 break;
214 }
215 default:
216 break;
217 }
218 }
219
220 /* TGSI generated immediates are always entire vec4's, ones we
221 * generate internally are not:
222 */
223 ctx->immediate_idx = ctx->so->num_immediates * 4;
224
225 ctx->so->first_immediate = ctx->num_regs[TGSI_FILE_CONSTANT];
226
227 tgsi_parse_free(&ctx->parser);
228
229 return tgsi_parse_init(&ctx->parser, so->tokens);
230 }
231
/* Release parser state created by compile_init(). */
static void
compile_free(struct fd_compile_context *ctx)
{
	tgsi_parse_free(&ctx->parser);
}
237
238 static struct ir2_cf *
239 next_exec_cf(struct fd_compile_context *ctx)
240 {
241 struct ir2_cf *cf = ctx->cf;
242 if (!cf || cf->exec.instrs_count >= ARRAY_SIZE(ctx->cf->exec.instrs))
243 ctx->cf = cf = ir2_cf_create(ctx->so->ir, EXEC);
244 return cf;
245 }
246
247 static void
248 compile_vtx_fetch(struct fd_compile_context *ctx)
249 {
250 struct ir2_instruction **vfetch_instrs = ctx->so->vfetch_instrs;
251 int i;
252 for (i = 0; i < ctx->num_regs[TGSI_FILE_INPUT]; i++) {
253 struct ir2_instruction *instr = ir2_instr_create(
254 next_exec_cf(ctx), IR2_FETCH);
255 instr->fetch.opc = VTX_FETCH;
256
257 ctx->need_sync |= 1 << (i+1);
258
259 ir2_reg_create(instr, i+1, "xyzw", 0);
260 ir2_reg_create(instr, 0, "x", 0);
261
262 if (i == 0)
263 instr->sync = true;
264
265 vfetch_instrs[i] = instr;
266 }
267 ctx->so->num_vfetch_instrs = i;
268 ctx->cf = NULL;
269 }
270
271 /*
272 * For vertex shaders (VS):
273 * --- ------ -------------
274 *
275 * Inputs: R1-R(num_input)
276 * Constants: C0-C(num_const-1)
277 * Immediates: C(num_const)-C(num_const+num_imm-1)
278 * Outputs: export0-export(n) and export62, export63
279 * n is # of outputs minus gl_Position (export62) and gl_PointSize (export63)
280 * Temps: R(num_input+1)-R(num_input+num_temps)
281 *
282 * R0 could be clobbered after the vertex fetch instructions.. so we
283 * could use it for one of the temporaries.
284 *
285 * TODO: maybe the vertex fetch part could fetch first input into R0 as
286 * the last vtx fetch instruction, which would let us use the same
287 * register layout in either case.. although this is not what the blob
288 * compiler does.
289 *
290 *
291 * For frag shaders (PS):
292 * --- ---- -------------
293 *
294 * Inputs: R0-R(num_input-1)
295 * Constants: same as VS
296 * Immediates: same as VS
297 * Outputs: export0-export(num_outputs)
298 * Temps: R(num_input)-R(num_input+num_temps-1)
299 *
 * In either case, immediates are appended to the constants
301 * (uniforms).
302 *
303 */
304
305 static unsigned
306 get_temp_gpr(struct fd_compile_context *ctx, int idx)
307 {
308 unsigned num = idx + ctx->num_regs[TGSI_FILE_INPUT];
309 if (ctx->type == TGSI_PROCESSOR_VERTEX)
310 num++;
311 return num;
312 }
313
314 static struct ir2_register *
315 add_dst_reg(struct fd_compile_context *ctx, struct ir2_instruction *alu,
316 const struct tgsi_dst_register *dst)
317 {
318 unsigned flags = 0, num = 0;
319 char swiz[5];
320
321 switch (dst->File) {
322 case TGSI_FILE_OUTPUT:
323 flags |= IR2_REG_EXPORT;
324 if (ctx->type == TGSI_PROCESSOR_VERTEX) {
325 if (dst->Index == ctx->position) {
326 num = 62;
327 } else if (dst->Index == ctx->psize) {
328 num = 63;
329 } else {
330 num = export_linkage(ctx,
331 ctx->output_export_idx[dst->Index]);
332 }
333 } else {
334 num = dst->Index;
335 }
336 break;
337 case TGSI_FILE_TEMPORARY:
338 num = get_temp_gpr(ctx, dst->Index);
339 break;
340 default:
341 DBG("unsupported dst register file: %s",
342 tgsi_file_name(dst->File));
343 assert(0);
344 break;
345 }
346
347 swiz[0] = (dst->WriteMask & TGSI_WRITEMASK_X) ? 'x' : '_';
348 swiz[1] = (dst->WriteMask & TGSI_WRITEMASK_Y) ? 'y' : '_';
349 swiz[2] = (dst->WriteMask & TGSI_WRITEMASK_Z) ? 'z' : '_';
350 swiz[3] = (dst->WriteMask & TGSI_WRITEMASK_W) ? 'w' : '_';
351 swiz[4] = '\0';
352
353 return ir2_reg_create(alu, num, swiz, flags);
354 }
355
356 static struct ir2_register *
357 add_src_reg(struct fd_compile_context *ctx, struct ir2_instruction *alu,
358 const struct tgsi_src_register *src)
359 {
360 static const char swiz_vals[] = {
361 'x', 'y', 'z', 'w',
362 };
363 char swiz[5];
364 unsigned flags = 0, num = 0;
365
366 switch (src->File) {
367 case TGSI_FILE_CONSTANT:
368 num = src->Index;
369 flags |= IR2_REG_CONST;
370 break;
371 case TGSI_FILE_INPUT:
372 if (ctx->type == TGSI_PROCESSOR_VERTEX) {
373 num = src->Index + 1;
374 } else {
375 num = export_linkage(ctx,
376 ctx->input_export_idx[src->Index]);
377 }
378 break;
379 case TGSI_FILE_TEMPORARY:
380 num = get_temp_gpr(ctx, src->Index);
381 break;
382 case TGSI_FILE_IMMEDIATE:
383 num = src->Index + ctx->num_regs[TGSI_FILE_CONSTANT];
384 flags |= IR2_REG_CONST;
385 break;
386 default:
387 DBG("unsupported src register file: %s",
388 tgsi_file_name(src->File));
389 assert(0);
390 break;
391 }
392
393 if (src->Absolute)
394 flags |= IR2_REG_ABS;
395 if (src->Negate)
396 flags |= IR2_REG_NEGATE;
397
398 swiz[0] = swiz_vals[src->SwizzleX];
399 swiz[1] = swiz_vals[src->SwizzleY];
400 swiz[2] = swiz_vals[src->SwizzleZ];
401 swiz[3] = swiz_vals[src->SwizzleW];
402 swiz[4] = '\0';
403
404 if ((ctx->need_sync & (uint64_t)(1 << num)) &&
405 !(flags & IR2_REG_CONST)) {
406 alu->sync = true;
407 ctx->need_sync &= ~(uint64_t)(1 << num);
408 }
409
410 return ir2_reg_create(alu, num, swiz, flags);
411 }
412
413 static void
414 add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
415 {
416 switch (inst->Instruction.Saturate) {
417 case TGSI_SAT_NONE:
418 break;
419 case TGSI_SAT_ZERO_ONE:
420 alu->alu.vector_clamp = true;
421 break;
422 case TGSI_SAT_MINUS_PLUS_ONE:
423 DBG("unsupported saturate");
424 assert(0);
425 break;
426 }
427 }
428
429 static void
430 add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
431 {
432 switch (inst->Instruction.Saturate) {
433 case TGSI_SAT_NONE:
434 break;
435 case TGSI_SAT_ZERO_ONE:
436 alu->alu.scalar_clamp = true;
437 break;
438 case TGSI_SAT_MINUS_PLUS_ONE:
439 DBG("unsupported saturate");
440 assert(0);
441 break;
442 }
443 }
444
/* Wire up regs for a one-src vector ALU op.  The hw vector ops take
 * two srcs, so the single TGSI src is added twice (eg. MOV is
 * implemented as MAXv src0, src0).
 */
static void
add_regs_vector_1(struct fd_compile_context *ctx,
		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
	assert(inst->Instruction.NumSrcRegs == 1);
	assert(inst->Instruction.NumDstRegs == 1);

	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
	add_src_reg(ctx, alu, &inst->Src[0].Register);
	add_src_reg(ctx, alu, &inst->Src[0].Register);
	add_vector_clamp(inst, alu);
}
457
/* Wire up dst + two srcs for a two-src vector ALU op, plus saturate. */
static void
add_regs_vector_2(struct fd_compile_context *ctx,
		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
	assert(inst->Instruction.NumSrcRegs == 2);
	assert(inst->Instruction.NumDstRegs == 1);

	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
	add_src_reg(ctx, alu, &inst->Src[0].Register);
	add_src_reg(ctx, alu, &inst->Src[1].Register);
	add_vector_clamp(inst, alu);
}
470
/* Wire up dst + three srcs for a three-src vector ALU op (MULADDv,
 * CNDGTEv).  Note the src order expected by ir.c (addend first).
 */
static void
add_regs_vector_3(struct fd_compile_context *ctx,
		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
	assert(inst->Instruction.NumSrcRegs == 3);
	assert(inst->Instruction.NumDstRegs == 1);

	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
	/* maybe should re-arrange the syntax some day, but
	 * in assembler/disassembler and what ir.c expects
	 * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1
	 */
	add_src_reg(ctx, alu, &inst->Src[2].Register);
	add_src_reg(ctx, alu, &inst->Src[0].Register);
	add_src_reg(ctx, alu, &inst->Src[1].Register);
	add_vector_clamp(inst, alu);
}
488
/* Fill the (unused) vector slot of a scalar-only co-issue instr. */
static void
add_regs_dummy_vector(struct ir2_instruction *alu)
{
	/* create dummy, non-written vector dst/src regs
	 * for unused vector instr slot:
	 */
	ir2_reg_create(alu, 0, "____", 0); /* vector dst */
	ir2_reg_create(alu, 0, NULL, 0);   /* vector src1 */
	ir2_reg_create(alu, 0, NULL, 0);   /* vector src2 */
}
499
/* Wire up regs for a one-src scalar ALU op: the vector slot gets
 * dummy regs, then dst + src for the scalar slot, plus saturate.
 */
static void
add_regs_scalar_1(struct fd_compile_context *ctx,
		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
	assert(inst->Instruction.NumSrcRegs == 1);
	assert(inst->Instruction.NumDstRegs == 1);

	add_regs_dummy_vector(alu);

	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
	add_src_reg(ctx, alu, &inst->Src[0].Register);
	add_scalar_clamp(inst, alu);
}
513
514 /*
515 * Helpers for TGSI instructions that don't map to a single shader instr:
516 */
517
/* Initialize 'src' to reference the same register as 'dst', with an
 * identity swizzle and no abs/negate modifiers.  Only the fields
 * shown are touched; any other tgsi_src_register fields keep whatever
 * value the caller's struct already had.
 */
static void
src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
{
	src->File      = dst->File;
	src->Indirect  = dst->Indirect;
	src->Dimension = dst->Dimension;
	src->Index     = dst->Index;
	src->Absolute  = 0;
	src->Negate    = 0;
	src->SwizzleX  = TGSI_SWIZZLE_X;
	src->SwizzleY  = TGSI_SWIZZLE_Y;
	src->SwizzleZ  = TGSI_SWIZZLE_Z;
	src->SwizzleW  = TGSI_SWIZZLE_W;
}
532
/* Get internal-temp src/dst to use for a sequence of instructions
 * generated by a single TGSI op.. if possible, use the final dst
 * register as the temporary to avoid allocating a new register, but
 * if necessary allocate one.  If a single TGSI op needs multiple
 * internal temps, pass NULL for orig_dst for all but the first one
 * so that you don't end up using the same register for all your
 * internal temps.
 */
static void
get_internal_temp(struct fd_compile_context *ctx,
		struct tgsi_dst_register *tmp_dst,
		struct tgsi_src_register *tmp_src)
{
	int n;

	/* dst writes all four components by default: */
	tmp_dst->File = TGSI_FILE_TEMPORARY;
	tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
	tmp_dst->Indirect = 0;
	tmp_dst->Dimension = 0;

	/* assign next temporary: */
	n = ctx->num_internal_temps++;
	/* slot 0 above the TGSI temps is reserved for the predicate reg
	 * while it is live (see comment on struct fd_compile_context):
	 */
	if (ctx->pred_reg != -1)
		n++;

	tmp_dst->Index = ctx->num_regs[TGSI_FILE_TEMPORARY] + n;

	/* matching src with identity swizzle: */
	src_from_dst(tmp_src, tmp_dst);
}
562
/* Build a dst operand (and, if 'src' is non-NULL, a matching src
 * operand swizzled to .wwww) referencing the w component of the
 * currently-assigned predicate register.
 */
static void
get_predicate(struct fd_compile_context *ctx, struct tgsi_dst_register *dst,
		struct tgsi_src_register *src)
{
	/* only valid while a predicate reg is assigned (push_predicate): */
	assert(ctx->pred_reg != -1);

	dst->File = TGSI_FILE_TEMPORARY;
	dst->WriteMask = TGSI_WRITEMASK_W;
	dst->Indirect = 0;
	dst->Dimension = 0;
	dst->Index = get_temp_gpr(ctx, ctx->pred_reg);

	if (src) {
		src_from_dst(src, dst);
		/* the predicate value lives in .w only: */
		src->SwizzleX = TGSI_SWIZZLE_W;
		src->SwizzleY = TGSI_SWIZZLE_W;
		src->SwizzleZ = TGSI_SWIZZLE_W;
		src->SwizzleW = TGSI_SWIZZLE_W;
	}
}
583
/* Enter a predicated block: at depth 0, assign the predicate register
 * and compute it from 'src' with PRED_SETNEs; when nesting, AND the
 * new condition into the existing predicate value via MULv.  The
 * previous ir-level predicate state is saved on ctx->pred_stack for
 * pop_predicate().
 */
static void
push_predicate(struct fd_compile_context *ctx, struct tgsi_src_register *src)
{
	struct ir2_instruction *alu;
	struct tgsi_dst_register pred_dst;

	/* NOTE blob compiler seems to always puts PRED_* instrs in a CF by
	 * themselves:
	 */
	ctx->cf = NULL;

	if (ctx->pred_depth == 0) {
		/* assign predicate register: */
		ctx->pred_reg = ctx->num_regs[TGSI_FILE_TEMPORARY];

		get_predicate(ctx, &pred_dst, NULL);

		alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, PRED_SETNEs);
		add_regs_dummy_vector(alu);
		add_dst_reg(ctx, alu, &pred_dst);
		add_src_reg(ctx, alu, src);
	} else {
		struct tgsi_src_register pred_src;

		get_predicate(ctx, &pred_dst, &pred_src);

		/* combine new condition with current predicate value: */
		alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
		add_dst_reg(ctx, alu, &pred_dst);
		add_src_reg(ctx, alu, &pred_src);
		add_src_reg(ctx, alu, src);

		// XXX need to make PRED_SETE_PUSHv IR2_PRED_NONE.. but need to make
		// sure src reg is valid if it was calculated with a predicate
		// condition..
		alu->pred = IR2_PRED_NONE;
	}

	/* save previous pred state to restore in pop_predicate(): */
	ctx->pred_stack[ctx->pred_depth++] = ctx->so->ir->pred;

	/* force the next instruction into a fresh CF: */
	ctx->cf = NULL;
}
626
/* Leave a predicated block: restore the saved ir-level predicate
 * state; if still nested, recompute the predicate register with
 * PRED_SET_POPs, otherwise release the predicate register.
 */
static void
pop_predicate(struct fd_compile_context *ctx)
{
	/* NOTE blob compiler seems to always puts PRED_* instrs in a CF by
	 * themselves:
	 */
	ctx->cf = NULL;

	/* restore previous predicate state: */
	ctx->so->ir->pred = ctx->pred_stack[--ctx->pred_depth];

	if (ctx->pred_depth != 0) {
		struct ir2_instruction *alu;
		struct tgsi_dst_register pred_dst;
		struct tgsi_src_register pred_src;

		get_predicate(ctx, &pred_dst, &pred_src);

		alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, PRED_SET_POPs);
		add_regs_dummy_vector(alu);
		add_dst_reg(ctx, alu, &pred_dst);
		add_src_reg(ctx, alu, &pred_src);
		/* runs unconditionally, regardless of current predicate: */
		alu->pred = IR2_PRED_NONE;
	} else {
		/* predicate register no longer needed: */
		ctx->pred_reg = -1;
	}

	/* force the next instruction into a fresh CF: */
	ctx->cf = NULL;
}
657
658 static void
659 get_immediate(struct fd_compile_context *ctx,
660 struct tgsi_src_register *reg, uint32_t val)
661 {
662 unsigned neg, swiz, idx, i;
663 /* actually maps 1:1 currently.. not sure if that is safe to rely on: */
664 static const unsigned swiz2tgsi[] = {
665 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
666 };
667
668 for (i = 0; i < ctx->immediate_idx; i++) {
669 swiz = i % 4;
670 idx = i / 4;
671
672 if (ctx->so->immediates[idx].val[swiz] == val) {
673 neg = 0;
674 break;
675 }
676
677 if (ctx->so->immediates[idx].val[swiz] == -val) {
678 neg = 1;
679 break;
680 }
681 }
682
683 if (i == ctx->immediate_idx) {
684 /* need to generate a new immediate: */
685 swiz = i % 4;
686 idx = i / 4;
687 neg = 0;
688 ctx->so->immediates[idx].val[swiz] = val;
689 ctx->so->num_immediates = idx + 1;
690 ctx->immediate_idx++;
691 }
692
693 reg->File = TGSI_FILE_IMMEDIATE;
694 reg->Indirect = 0;
695 reg->Dimension = 0;
696 reg->Index = idx;
697 reg->Absolute = 0;
698 reg->Negate = neg;
699 reg->SwizzleX = swiz2tgsi[swiz];
700 reg->SwizzleY = swiz2tgsi[swiz];
701 reg->SwizzleZ = swiz2tgsi[swiz];
702 reg->SwizzleW = swiz2tgsi[swiz];
703 }
704
/* POW(a,b) = EXP2(b * LOG2(a)) */
static void
translate_pow(struct fd_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register tmp_src;
	struct ir2_instruction *alu;

	get_internal_temp(ctx, &tmp_dst, &tmp_src);

	/* tmp = log2(a): */
	alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, LOG_CLAMP);
	add_regs_dummy_vector(alu);
	add_dst_reg(ctx, alu, &tmp_dst);
	add_src_reg(ctx, alu, &inst->Src[0].Register);

	/* tmp = b * tmp: */
	alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
	add_dst_reg(ctx, alu, &tmp_dst);
	add_src_reg(ctx, alu, &tmp_src);
	add_src_reg(ctx, alu, &inst->Src[1].Register);

	/* NOTE: some of the instructions, like EXP_IEEE, seem hard-
	 * coded to take their input from the w component.
	 */
	switch(inst->Dst[0].Register.WriteMask) {
	case TGSI_WRITEMASK_X:
		tmp_src.SwizzleW = TGSI_SWIZZLE_X;
		break;
	case TGSI_WRITEMASK_Y:
		tmp_src.SwizzleW = TGSI_SWIZZLE_Y;
		break;
	case TGSI_WRITEMASK_Z:
		tmp_src.SwizzleW = TGSI_SWIZZLE_Z;
		break;
	case TGSI_WRITEMASK_W:
		tmp_src.SwizzleW = TGSI_SWIZZLE_W;
		break;
	default:
		/* only single-component writemasks are handled here: */
		DBG("invalid writemask!");
		assert(0);
		break;
	}

	/* dst = exp2(tmp): */
	alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, EXP_IEEE);
	add_regs_dummy_vector(alu);
	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
	add_src_reg(ctx, alu, &tmp_src);
	add_scalar_clamp(inst, alu);
}
754
755 static void
756 translate_tex(struct fd_compile_context *ctx,
757 struct tgsi_full_instruction *inst, unsigned opc)
758 {
759 struct ir2_instruction *instr;
760 struct ir2_register *reg;
761 struct tgsi_dst_register tmp_dst;
762 struct tgsi_src_register tmp_src;
763 const struct tgsi_src_register *coord;
764 bool using_temp = (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) ||
765 (inst->Instruction.Saturate != TGSI_SAT_NONE);
766 int idx;
767
768 if (using_temp || (opc == TGSI_OPCODE_TXP))
769 get_internal_temp(ctx, &tmp_dst, &tmp_src);
770
771 if (opc == TGSI_OPCODE_TXP) {
772 static const char *swiz[] = {
773 [TGSI_SWIZZLE_X] = "xxxx",
774 [TGSI_SWIZZLE_Y] = "yyyy",
775 [TGSI_SWIZZLE_Z] = "zzzz",
776 [TGSI_SWIZZLE_W] = "wwww",
777 };
778
779 /* TXP - Projective Texture Lookup:
780 *
781 * coord.x = src0.x / src.w
782 * coord.y = src0.y / src.w
783 * coord.z = src0.z / src.w
784 * coord.w = src0.w
785 * bias = 0.0
786 *
787 * dst = texture_sample(unit, coord, bias)
788 */
789 instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, RECIP_IEEE);
790
791 /* MAXv: */
792 add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "___w";
793 add_src_reg(ctx, instr, &inst->Src[0].Register);
794 add_src_reg(ctx, instr, &inst->Src[0].Register);
795
796 /* RECIP_IEEE: */
797 add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "x___";
798 add_src_reg(ctx, instr, &inst->Src[0].Register)->swizzle =
799 swiz[inst->Src[0].Register.SwizzleW];
800
801 instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
802 add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "xyz_";
803 add_src_reg(ctx, instr, &tmp_src)->swizzle = "xxxx";
804 add_src_reg(ctx, instr, &inst->Src[0].Register);
805
806 coord = &tmp_src;
807 } else {
808 coord = &inst->Src[0].Register;
809 }
810
811 instr = ir2_instr_create(next_exec_cf(ctx), IR2_FETCH);
812 instr->fetch.opc = TEX_FETCH;
813 instr->fetch.is_cube = (inst->Texture.Texture == TGSI_TEXTURE_3D);
814 assert(inst->Texture.NumOffsets <= 1); // TODO what to do in other cases?
815
816 /* save off the tex fetch to be patched later with correct const_idx: */
817 idx = ctx->so->num_tfetch_instrs++;
818 ctx->so->tfetch_instrs[idx].samp_id = inst->Src[1].Register.Index;
819 ctx->so->tfetch_instrs[idx].instr = instr;
820
821 add_dst_reg(ctx, instr, using_temp ? &tmp_dst : &inst->Dst[0].Register);
822 reg = add_src_reg(ctx, instr, coord);
823
824 /* blob compiler always sets 3rd component to same as 1st for 2d: */
825 if (inst->Texture.Texture == TGSI_TEXTURE_2D)
826 reg->swizzle[2] = reg->swizzle[0];
827
828 /* dst register needs to be marked for sync: */
829 ctx->need_sync |= 1 << instr->regs[0]->num;
830
831 /* TODO we need some way to know if the tex fetch needs to sync on alu pipe.. */
832 instr->sync = true;
833
834 if (using_temp) {
835 /* texture fetch can't write directly to export, so if tgsi
836 * is telling us the dst register is in output file, we load
837 * the texture to a temp and the use ALU instruction to move
838 * to output
839 */
840 instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, ~0);
841
842 add_dst_reg(ctx, instr, &inst->Dst[0].Register);
843 add_src_reg(ctx, instr, &tmp_src);
844 add_src_reg(ctx, instr, &tmp_src);
845 add_vector_clamp(inst, instr);
846 }
847 }
848
/* SGE(a,b) = GTE((b - a), 1.0, 0.0) */
/* SLT(a,b) = GTE((b - a), 0.0, 1.0) */
static void
translate_sge_slt(struct fd_compile_context *ctx,
		struct tgsi_full_instruction *inst, unsigned opc)
{
	struct ir2_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register tmp_src;
	struct tgsi_src_register tmp_const;
	float c0, c1;

	switch (opc) {
	default:
		assert(0);
		/* fallthrough */
	case TGSI_OPCODE_SGE:
		c0 = 1.0;
		c1 = 0.0;
		break;
	case TGSI_OPCODE_SLT:
		c0 = 0.0;
		c1 = 1.0;
		break;
	}

	get_internal_temp(ctx, &tmp_dst, &tmp_src);

	/* tmp = src1 - src0: */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst);
	add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;
	add_src_reg(ctx, instr, &inst->Src[1].Register);

	/* dst = CNDGTEv(c0, tmp, c1) -- selects c0/c1 based on tmp: */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), CNDGTEv, ~0);
	add_dst_reg(ctx, instr, &inst->Dst[0].Register);
	/* maybe should re-arrange the syntax some day, but
	 * in assembler/disassembler and what ir.c expects
	 * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1
	 */
	get_immediate(ctx, &tmp_const, fui(c0));
	add_src_reg(ctx, instr, &tmp_const);
	add_src_reg(ctx, instr, &tmp_src);
	get_immediate(ctx, &tmp_const, fui(c1));
	add_src_reg(ctx, instr, &tmp_const);
}
893
/* LRP(a,b,c) = (a * b) + ((1 - a) * c) */
static void
translate_lrp(struct fd_compile_context *ctx,
		struct tgsi_full_instruction *inst,
		unsigned opc)
{
	struct ir2_instruction *instr;
	struct tgsi_dst_register tmp_dst1, tmp_dst2;
	struct tgsi_src_register tmp_src1, tmp_src2;
	struct tgsi_src_register tmp_const;

	/* two temps needed since tmp1 must survive while tmp2 is built: */
	get_internal_temp(ctx, &tmp_dst1, &tmp_src1);
	get_internal_temp(ctx, &tmp_dst2, &tmp_src2);

	get_immediate(ctx, &tmp_const, fui(1.0));

	/* tmp1 = (a * b) */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst1);
	add_src_reg(ctx, instr, &inst->Src[0].Register);
	add_src_reg(ctx, instr, &inst->Src[1].Register);

	/* tmp2 = (1 - a) */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst2);
	add_src_reg(ctx, instr, &tmp_const);
	add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;

	/* tmp2 = tmp2 * c */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst2);
	add_src_reg(ctx, instr, &tmp_src2);
	add_src_reg(ctx, instr, &inst->Src[2].Register);

	/* dst = tmp1 + tmp2 */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
	add_dst_reg(ctx, instr, &inst->Dst[0].Register);
	add_src_reg(ctx, instr, &tmp_src1);
	add_src_reg(ctx, instr, &tmp_src2);
}
934
/* SIN/COS: the hw SIN/COS scalar ops need the angle range-reduced to
 * [-pi, pi), done here as:  frac(x/(2*pi) + 0.5) * (2*pi) - pi
 * (0.159155 ~= 1/(2*pi), 6.283185 ~= 2*pi).
 */
static void
translate_trig(struct fd_compile_context *ctx,
		struct tgsi_full_instruction *inst,
		unsigned opc)
{
	struct ir2_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register tmp_src;
	struct tgsi_src_register tmp_const;
	instr_scalar_opc_t op;

	switch (opc) {
	default:
		assert(0);
		/* fallthrough */
	case TGSI_OPCODE_SIN:
		op = SIN;
		break;
	case TGSI_OPCODE_COS:
		op = COS;
		break;
	}

	get_internal_temp(ctx, &tmp_dst, &tmp_src);

	/* only the x component of the temp is used: */
	tmp_dst.WriteMask = TGSI_WRITEMASK_X;
	tmp_src.SwizzleX = tmp_src.SwizzleY =
			tmp_src.SwizzleZ = tmp_src.SwizzleW = TGSI_SWIZZLE_X;

	/* maybe should re-arrange the syntax some day, but
	 * in assembler/disassembler and what ir.c expects
	 * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1
	 */
	/* tmp = 0.5 + src * (1/(2*pi)): */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst);
	get_immediate(ctx, &tmp_const, fui(0.5));
	add_src_reg(ctx, instr, &tmp_const);
	add_src_reg(ctx, instr, &inst->Src[0].Register);
	get_immediate(ctx, &tmp_const, fui(0.159155));
	add_src_reg(ctx, instr, &tmp_const);

	/* tmp = frac(tmp): */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), FRACv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst);
	add_src_reg(ctx, instr, &tmp_src);
	add_src_reg(ctx, instr, &tmp_src);

	/* tmp = -pi + tmp * (2*pi): */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst);
	get_immediate(ctx, &tmp_const, fui(-3.141593));
	add_src_reg(ctx, instr, &tmp_const);
	add_src_reg(ctx, instr, &tmp_src);
	get_immediate(ctx, &tmp_const, fui(6.283185));
	add_src_reg(ctx, instr, &tmp_const);

	/* dst = sin/cos(tmp): */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), ~0, op);
	add_regs_dummy_vector(instr);
	add_dst_reg(ctx, instr, &inst->Dst[0].Register);
	add_src_reg(ctx, instr, &tmp_src);
}
993
994 /*
995 * Main part of compiler/translator:
996 */
997
998 static void
999 translate_instruction(struct fd_compile_context *ctx,
1000 struct tgsi_full_instruction *inst)
1001 {
1002 unsigned opc = inst->Instruction.Opcode;
1003 struct ir2_instruction *instr;
1004 static struct ir2_cf *cf;
1005
1006 if (opc == TGSI_OPCODE_END)
1007 return;
1008
1009 if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
1010 unsigned num = inst->Dst[0].Register.Index;
1011 /* seems like we need to ensure that position vs param/pixel
1012 * exports don't end up in the same EXEC clause.. easy way
1013 * to do this is force a new EXEC clause on first appearance
1014 * of an position or param/pixel export.
1015 */
1016 if ((num == ctx->position) || (num == ctx->psize)) {
1017 if (ctx->num_position > 0) {
1018 ctx->cf = NULL;
1019 ir2_cf_create_alloc(ctx->so->ir, SQ_POSITION,
1020 ctx->num_position - 1);
1021 ctx->num_position = 0;
1022 }
1023 } else {
1024 if (ctx->num_param > 0) {
1025 ctx->cf = NULL;
1026 ir2_cf_create_alloc(ctx->so->ir, SQ_PARAMETER_PIXEL,
1027 ctx->num_param - 1);
1028 ctx->num_param = 0;
1029 }
1030 }
1031 }
1032
1033 cf = next_exec_cf(ctx);
1034
1035 /* TODO turn this into a table: */
1036 switch (opc) {
1037 case TGSI_OPCODE_MOV:
1038 instr = ir2_instr_create_alu(cf, MAXv, ~0);
1039 add_regs_vector_1(ctx, inst, instr);
1040 break;
1041 case TGSI_OPCODE_RCP:
1042 instr = ir2_instr_create_alu(cf, ~0, RECIP_IEEE);
1043 add_regs_scalar_1(ctx, inst, instr);
1044 break;
1045 case TGSI_OPCODE_RSQ:
1046 instr = ir2_instr_create_alu(cf, ~0, RECIPSQ_IEEE);
1047 add_regs_scalar_1(ctx, inst, instr);
1048 break;
1049 case TGSI_OPCODE_MUL:
1050 instr = ir2_instr_create_alu(cf, MULv, ~0);
1051 add_regs_vector_2(ctx, inst, instr);
1052 break;
1053 case TGSI_OPCODE_ADD:
1054 instr = ir2_instr_create_alu(cf, ADDv, ~0);
1055 add_regs_vector_2(ctx, inst, instr);
1056 break;
1057 case TGSI_OPCODE_DP3:
1058 instr = ir2_instr_create_alu(cf, DOT3v, ~0);
1059 add_regs_vector_2(ctx, inst, instr);
1060 break;
1061 case TGSI_OPCODE_DP4:
1062 instr = ir2_instr_create_alu(cf, DOT4v, ~0);
1063 add_regs_vector_2(ctx, inst, instr);
1064 break;
1065 case TGSI_OPCODE_MIN:
1066 instr = ir2_instr_create_alu(cf, MINv, ~0);
1067 add_regs_vector_2(ctx, inst, instr);
1068 break;
1069 case TGSI_OPCODE_MAX:
1070 instr = ir2_instr_create_alu(cf, MAXv, ~0);
1071 add_regs_vector_2(ctx, inst, instr);
1072 break;
1073 case TGSI_OPCODE_SLT:
1074 case TGSI_OPCODE_SGE:
1075 translate_sge_slt(ctx, inst, opc);
1076 break;
1077 case TGSI_OPCODE_MAD:
1078 instr = ir2_instr_create_alu(cf, MULADDv, ~0);
1079 add_regs_vector_3(ctx, inst, instr);
1080 break;
1081 case TGSI_OPCODE_LRP:
1082 translate_lrp(ctx, inst, opc);
1083 break;
1084 case TGSI_OPCODE_FRC:
1085 instr = ir2_instr_create_alu(cf, FRACv, ~0);
1086 add_regs_vector_1(ctx, inst, instr);
1087 break;
1088 case TGSI_OPCODE_FLR:
1089 instr = ir2_instr_create_alu(cf, FLOORv, ~0);
1090 add_regs_vector_1(ctx, inst, instr);
1091 break;
1092 case TGSI_OPCODE_EX2:
1093 instr = ir2_instr_create_alu(cf, ~0, EXP_IEEE);
1094 add_regs_scalar_1(ctx, inst, instr);
1095 break;
1096 case TGSI_OPCODE_POW:
1097 translate_pow(ctx, inst);
1098 break;
1099 case TGSI_OPCODE_ABS:
1100 instr = ir2_instr_create_alu(cf, MAXv, ~0);
1101 add_regs_vector_1(ctx, inst, instr);
1102 instr->regs[1]->flags |= IR2_REG_NEGATE; /* src0 */
1103 break;
1104 case TGSI_OPCODE_COS:
1105 case TGSI_OPCODE_SIN:
1106 translate_trig(ctx, inst, opc);
1107 break;
1108 case TGSI_OPCODE_TEX:
1109 case TGSI_OPCODE_TXP:
1110 translate_tex(ctx, inst, opc);
1111 break;
1112 case TGSI_OPCODE_CMP:
1113 instr = ir2_instr_create_alu(cf, CNDGTEv, ~0);
1114 add_regs_vector_3(ctx, inst, instr);
1115 // TODO this should be src0 if regs where in sane order..
1116 instr->regs[2]->flags ^= IR2_REG_NEGATE; /* src1 */
1117 break;
1118 case TGSI_OPCODE_IF:
1119 push_predicate(ctx, &inst->Src[0].Register);
1120 ctx->so->ir->pred = IR2_PRED_EQ;
1121 break;
1122 case TGSI_OPCODE_ELSE:
1123 ctx->so->ir->pred = IR2_PRED_NE;
1124 /* not sure if this is required in all cases, but blob compiler
1125 * won't combine EQ and NE in same CF:
1126 */
1127 ctx->cf = NULL;
1128 break;
1129 case TGSI_OPCODE_ENDIF:
1130 pop_predicate(ctx);
1131 break;
1132 case TGSI_OPCODE_F2I:
1133 instr = ir2_instr_create_alu(cf, TRUNCv, ~0);
1134 add_regs_vector_1(ctx, inst, instr);
1135 break;
1136 default:
1137 DBG("unknown TGSI opc: %s", tgsi_get_opcode_name(opc));
1138 tgsi_dump(ctx->so->tokens, 0);
1139 assert(0);
1140 break;
1141 }
1142
1143 /* internal temporaries are only valid for the duration of a single
1144 * TGSI instruction:
1145 */
1146 ctx->num_internal_temps = 0;
1147 }
1148
1149 static void
1150 compile_instructions(struct fd_compile_context *ctx)
1151 {
1152 while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
1153 tgsi_parse_token(&ctx->parser);
1154
1155 switch (ctx->parser.FullToken.Token.Type) {
1156 case TGSI_TOKEN_TYPE_INSTRUCTION:
1157 translate_instruction(ctx,
1158 &ctx->parser.FullToken.FullInstruction);
1159 break;
1160 default:
1161 break;
1162 }
1163 }
1164
1165 ctx->cf->cf_type = EXEC_END;
1166 }
1167
1168 int
1169 fd_compile_shader(struct fd_program_stateobj *prog,
1170 struct fd_shader_stateobj *so)
1171 {
1172 struct fd_compile_context ctx;
1173
1174 ir2_shader_destroy(so->ir);
1175 so->ir = ir2_shader_create();
1176 so->num_vfetch_instrs = so->num_tfetch_instrs = so->num_immediates = 0;
1177
1178 if (compile_init(&ctx, prog, so) != TGSI_PARSE_OK)
1179 return -1;
1180
1181 if (ctx.type == TGSI_PROCESSOR_VERTEX) {
1182 compile_vtx_fetch(&ctx);
1183 } else if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
1184 prog->num_exports = 0;
1185 memset(prog->export_linkage, 0xff,
1186 sizeof(prog->export_linkage));
1187 }
1188
1189 compile_instructions(&ctx);
1190
1191 compile_free(&ctx);
1192
1193 return 0;
1194 }
1195