freedreno: prepare for a3xx
[mesa.git] / src / gallium / drivers / freedreno / a2xx / fd2_compiler.c
1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
2
3 /*
4 * Copyright (C) 2012 Rob Clark <robclark@freedesktop.org>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 *
25 * Authors:
26 * Rob Clark <robclark@freedesktop.org>
27 */
28
29 #include "pipe/p_state.h"
30 #include "util/u_string.h"
31 #include "util/u_memory.h"
32 #include "util/u_inlines.h"
33 #include "tgsi/tgsi_parse.h"
34 #include "tgsi/tgsi_ureg.h"
35 #include "tgsi/tgsi_info.h"
36 #include "tgsi/tgsi_strings.h"
37 #include "tgsi/tgsi_dump.h"
38
39 #include "fd2_compiler.h"
40 #include "fd2_program.h"
41 #include "fd2_util.h"
42
43 #include "instr-a2xx.h"
44 #include "ir-a2xx.h"
45
/* Per-compile state carried through the two TGSI passes (declarations,
 * then instructions).  One of these lives on the stack for the duration
 * of fd2_compile_shader().
 */
struct fd2_compile_context {
	struct fd_program_stateobj *prog;
	struct fd2_shader_stateobj *so;

	struct tgsi_parse_context parser;
	unsigned type;            /* TGSI_PROCESSOR_VERTEX or _FRAGMENT */

	/* predicate stack: one entry per nested IF level; each entry saves
	 * the ir pred state so pop_predicate() can restore it:
	 */
	int pred_depth;
	enum ir2_pred pred_stack[8];

	/* Internal-Temporary and Predicate register assignment:
	 *
	 * Some TGSI instructions which translate into multiple actual
	 * instructions need one or more temporary registers, which are not
	 * assigned from TGSI perspective (ie. not TGSI_FILE_TEMPORARY).
	 * And some instructions (texture fetch) cannot write directly to
	 * output registers.  We could be more clever and re-use dst or a
	 * src register in some cases.  But for now don't try to be clever.
	 * Eventually we should implement an optimization pass that re-
	 * juggles the register usage and gets rid of unneeded temporaries.
	 *
	 * The predicate register must be valid across multiple TGSI
	 * instructions, but internal temporary's do not.  For this reason,
	 * once the predicate register is requested, until it is no longer
	 * needed, it gets the first register slot after after the TGSI
	 * assigned temporaries (ie. num_regs[TGSI_FILE_TEMPORARY]), and the
	 * internal temporaries get the register slots above this.
	 */

	int pred_reg;             /* TGSI-space temp idx of predicate, -1 if unused */
	int num_internal_temps;   /* internal temps handed out for the current TGSI instr */

	/* highest register index + 1 seen per TGSI file: */
	uint8_t num_regs[TGSI_FILE_COUNT];

	/* maps input register idx to prog->export_linkage idx: */
	uint8_t input_export_idx[64];

	/* maps output register idx to prog->export_linkage idx: */
	uint8_t output_export_idx[64];

	/* idx/slot for last compiler generated immediate */
	unsigned immediate_idx;

	// TODO we can skip emit exports in the VS that the FS doesn't need..
	// and get rid perhaps of num_param..
	unsigned num_position, num_param;
	/* output reg idx of gl_Position / gl_PointSize (~0 if not declared): */
	unsigned position, psize;

	/* bitmask of GPRs written by fetch instrs that still need a sync
	 * before their first ALU read:
	 */
	uint64_t need_sync;

	/* current exec CF instruction */
	struct ir2_cf *cf;
};
100
101 static int
102 semantic_idx(struct tgsi_declaration_semantic *semantic)
103 {
104 int idx = semantic->Name;
105 if (idx == TGSI_SEMANTIC_GENERIC)
106 idx = TGSI_SEMANTIC_COUNT + semantic->Index;
107 return idx;
108 }
109
110 /* assign/get the input/export register # for given semantic idx as
111 * returned by semantic_idx():
112 */
113 static int
114 export_linkage(struct fd2_compile_context *ctx, int idx)
115 {
116 struct fd_program_stateobj *prog = ctx->prog;
117
118 /* if first time we've seen this export, assign the next available slot: */
119 if (prog->export_linkage[idx] == 0xff)
120 prog->export_linkage[idx] = prog->num_exports++;
121
122 return prog->export_linkage[idx];
123 }
124
/* First pass over the TGSI tokens: resets the per-compile state, then
 * extracts declaration/immediate information (per-file register counts,
 * semantic->export mappings, gl_Position/gl_PointSize output indices,
 * TGSI-declared immediates), and finally re-initializes the parser so
 * compile_instructions() can run the translation pass from the start.
 *
 * Returns TGSI_PARSE_OK on success, otherwise the tgsi_parse_init()
 * error code.
 */
static unsigned
compile_init(struct fd2_compile_context *ctx, struct fd_program_stateobj *prog,
		struct fd2_shader_stateobj *so)
{
	unsigned ret;

	ctx->prog = prog;
	ctx->so = so;
	ctx->cf = NULL;
	ctx->pred_depth = 0;

	ret = tgsi_parse_init(&ctx->parser, so->tokens);
	if (ret != TGSI_PARSE_OK)
		return ret;

	ctx->type = ctx->parser.FullHeader.Processor.Processor;
	/* ~0 == "output not declared": */
	ctx->position = ~0;
	ctx->psize = ~0;
	ctx->num_position = 0;
	ctx->num_param = 0;
	ctx->need_sync = 0;
	ctx->immediate_idx = 0;
	ctx->pred_reg = -1;
	ctx->num_internal_temps = 0;

	memset(ctx->num_regs, 0, sizeof(ctx->num_regs));
	memset(ctx->input_export_idx, 0, sizeof(ctx->input_export_idx));
	memset(ctx->output_export_idx, 0, sizeof(ctx->output_export_idx));

	/* do first pass to extract declarations: */
	while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
		tgsi_parse_token(&ctx->parser);

		switch (ctx->parser.FullToken.Token.Type) {
		case TGSI_TOKEN_TYPE_DECLARATION: {
			struct tgsi_full_declaration *decl =
					&ctx->parser.FullToken.FullDeclaration;
			if (decl->Declaration.File == TGSI_FILE_OUTPUT) {
				unsigned name = decl->Semantic.Name;

				assert(decl->Declaration.Semantic);  // TODO is this ever not true?

				ctx->output_export_idx[decl->Range.First] =
						semantic_idx(&decl->Semantic);

				if (ctx->type == TGSI_PROCESSOR_VERTEX) {
					switch (name) {
					case TGSI_SEMANTIC_POSITION:
						/* record reg idx so add_dst_reg() can route
						 * it to the dedicated position export:
						 */
						ctx->position = ctx->num_regs[TGSI_FILE_OUTPUT];
						ctx->num_position++;
						break;
					case TGSI_SEMANTIC_PSIZE:
						ctx->psize = ctx->num_regs[TGSI_FILE_OUTPUT];
						ctx->num_position++;
						break;
					case TGSI_SEMANTIC_COLOR:
					case TGSI_SEMANTIC_GENERIC:
						ctx->num_param++;
						break;
					default:
						DBG("unknown VS semantic name: %s",
								tgsi_semantic_names[name]);
						assert(0);
					}
				} else {
					switch (name) {
					case TGSI_SEMANTIC_COLOR:
					case TGSI_SEMANTIC_GENERIC:
						ctx->num_param++;
						break;
					default:
						DBG("unknown PS semantic name: %s",
								tgsi_semantic_names[name]);
						assert(0);
					}
				}
			} else if (decl->Declaration.File == TGSI_FILE_INPUT) {
				ctx->input_export_idx[decl->Range.First] =
						semantic_idx(&decl->Semantic);
			}
			ctx->num_regs[decl->Declaration.File] =
					MAX2(ctx->num_regs[decl->Declaration.File], decl->Range.Last + 1);
			break;
		}
		case TGSI_TOKEN_TYPE_IMMEDIATE: {
			struct tgsi_full_immediate *imm =
					&ctx->parser.FullToken.FullImmediate;
			unsigned n = ctx->so->num_immediates++;
			/* NOTE(review): no bounds check against the size of
			 * so->immediates[] here -- confirm the array cannot be
			 * overflowed by a shader with many immediates:
			 */
			memcpy(ctx->so->immediates[n].val, imm->u, 16);
			break;
		}
		default:
			break;
		}
	}

	/* TGSI generated immediates are always entire vec4's, ones we
	 * generate internally are not:
	 */
	ctx->immediate_idx = ctx->so->num_immediates * 4;

	/* immediates are appended after the user-supplied constants: */
	ctx->so->first_immediate = ctx->num_regs[TGSI_FILE_CONSTANT];

	tgsi_parse_free(&ctx->parser);

	/* rewind for the instruction-translation pass: */
	return tgsi_parse_init(&ctx->parser, so->tokens);
}
232
/* Release parser state acquired by compile_init(). */
static void
compile_free(struct fd2_compile_context *ctx)
{
	tgsi_parse_free(&ctx->parser);
}
238
239 static struct ir2_cf *
240 next_exec_cf(struct fd2_compile_context *ctx)
241 {
242 struct ir2_cf *cf = ctx->cf;
243 if (!cf || cf->exec.instrs_count >= ARRAY_SIZE(ctx->cf->exec.instrs))
244 ctx->cf = cf = ir2_cf_create(ctx->so->ir, EXEC);
245 return cf;
246 }
247
248 static void
249 compile_vtx_fetch(struct fd2_compile_context *ctx)
250 {
251 struct ir2_instruction **vfetch_instrs = ctx->so->vfetch_instrs;
252 int i;
253 for (i = 0; i < ctx->num_regs[TGSI_FILE_INPUT]; i++) {
254 struct ir2_instruction *instr = ir2_instr_create(
255 next_exec_cf(ctx), IR2_FETCH);
256 instr->fetch.opc = VTX_FETCH;
257
258 ctx->need_sync |= 1 << (i+1);
259
260 ir2_reg_create(instr, i+1, "xyzw", 0);
261 ir2_reg_create(instr, 0, "x", 0);
262
263 if (i == 0)
264 instr->sync = true;
265
266 vfetch_instrs[i] = instr;
267 }
268 ctx->so->num_vfetch_instrs = i;
269 ctx->cf = NULL;
270 }
271
272 /*
273 * For vertex shaders (VS):
274 * --- ------ -------------
275 *
276 * Inputs: R1-R(num_input)
277 * Constants: C0-C(num_const-1)
278 * Immediates: C(num_const)-C(num_const+num_imm-1)
279 * Outputs: export0-export(n) and export62, export63
280 * n is # of outputs minus gl_Position (export62) and gl_PointSize (export63)
281 * Temps: R(num_input+1)-R(num_input+num_temps)
282 *
283 * R0 could be clobbered after the vertex fetch instructions.. so we
284 * could use it for one of the temporaries.
285 *
286 * TODO: maybe the vertex fetch part could fetch first input into R0 as
287 * the last vtx fetch instruction, which would let us use the same
288 * register layout in either case.. although this is not what the blob
289 * compiler does.
290 *
291 *
292 * For frag shaders (PS):
293 * --- ---- -------------
294 *
295 * Inputs: R0-R(num_input-1)
296 * Constants: same as VS
297 * Immediates: same as VS
298 * Outputs: export0-export(num_outputs)
299 * Temps: R(num_input)-R(num_input+num_temps-1)
300 *
301 * In either case, immediates are are postpended to the constants
302 * (uniforms).
303 *
304 */
305
306 static unsigned
307 get_temp_gpr(struct fd2_compile_context *ctx, int idx)
308 {
309 unsigned num = idx + ctx->num_regs[TGSI_FILE_INPUT];
310 if (ctx->type == TGSI_PROCESSOR_VERTEX)
311 num++;
312 return num;
313 }
314
315 static struct ir2_register *
316 add_dst_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
317 const struct tgsi_dst_register *dst)
318 {
319 unsigned flags = 0, num = 0;
320 char swiz[5];
321
322 switch (dst->File) {
323 case TGSI_FILE_OUTPUT:
324 flags |= IR2_REG_EXPORT;
325 if (ctx->type == TGSI_PROCESSOR_VERTEX) {
326 if (dst->Index == ctx->position) {
327 num = 62;
328 } else if (dst->Index == ctx->psize) {
329 num = 63;
330 } else {
331 num = export_linkage(ctx,
332 ctx->output_export_idx[dst->Index]);
333 }
334 } else {
335 num = dst->Index;
336 }
337 break;
338 case TGSI_FILE_TEMPORARY:
339 num = get_temp_gpr(ctx, dst->Index);
340 break;
341 default:
342 DBG("unsupported dst register file: %s",
343 tgsi_file_name(dst->File));
344 assert(0);
345 break;
346 }
347
348 swiz[0] = (dst->WriteMask & TGSI_WRITEMASK_X) ? 'x' : '_';
349 swiz[1] = (dst->WriteMask & TGSI_WRITEMASK_Y) ? 'y' : '_';
350 swiz[2] = (dst->WriteMask & TGSI_WRITEMASK_Z) ? 'z' : '_';
351 swiz[3] = (dst->WriteMask & TGSI_WRITEMASK_W) ? 'w' : '_';
352 swiz[4] = '\0';
353
354 return ir2_reg_create(alu, num, swiz, flags);
355 }
356
357 static struct ir2_register *
358 add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
359 const struct tgsi_src_register *src)
360 {
361 static const char swiz_vals[] = {
362 'x', 'y', 'z', 'w',
363 };
364 char swiz[5];
365 unsigned flags = 0, num = 0;
366
367 switch (src->File) {
368 case TGSI_FILE_CONSTANT:
369 num = src->Index;
370 flags |= IR2_REG_CONST;
371 break;
372 case TGSI_FILE_INPUT:
373 if (ctx->type == TGSI_PROCESSOR_VERTEX) {
374 num = src->Index + 1;
375 } else {
376 num = export_linkage(ctx,
377 ctx->input_export_idx[src->Index]);
378 }
379 break;
380 case TGSI_FILE_TEMPORARY:
381 num = get_temp_gpr(ctx, src->Index);
382 break;
383 case TGSI_FILE_IMMEDIATE:
384 num = src->Index + ctx->num_regs[TGSI_FILE_CONSTANT];
385 flags |= IR2_REG_CONST;
386 break;
387 default:
388 DBG("unsupported src register file: %s",
389 tgsi_file_name(src->File));
390 assert(0);
391 break;
392 }
393
394 if (src->Absolute)
395 flags |= IR2_REG_ABS;
396 if (src->Negate)
397 flags |= IR2_REG_NEGATE;
398
399 swiz[0] = swiz_vals[src->SwizzleX];
400 swiz[1] = swiz_vals[src->SwizzleY];
401 swiz[2] = swiz_vals[src->SwizzleZ];
402 swiz[3] = swiz_vals[src->SwizzleW];
403 swiz[4] = '\0';
404
405 if ((ctx->need_sync & (uint64_t)(1 << num)) &&
406 !(flags & IR2_REG_CONST)) {
407 alu->sync = true;
408 ctx->need_sync &= ~(uint64_t)(1 << num);
409 }
410
411 return ir2_reg_create(alu, num, swiz, flags);
412 }
413
414 static void
415 add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
416 {
417 switch (inst->Instruction.Saturate) {
418 case TGSI_SAT_NONE:
419 break;
420 case TGSI_SAT_ZERO_ONE:
421 alu->alu.vector_clamp = true;
422 break;
423 case TGSI_SAT_MINUS_PLUS_ONE:
424 DBG("unsupported saturate");
425 assert(0);
426 break;
427 }
428 }
429
430 static void
431 add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
432 {
433 switch (inst->Instruction.Saturate) {
434 case TGSI_SAT_NONE:
435 break;
436 case TGSI_SAT_ZERO_ONE:
437 alu->alu.scalar_clamp = true;
438 break;
439 case TGSI_SAT_MINUS_PLUS_ONE:
440 DBG("unsupported saturate");
441 assert(0);
442 break;
443 }
444 }
445
/* Wire up a single-src vector op.  The one TGSI src is added twice
 * because the hw vector pipe always takes two operands (eg. MOV is
 * implemented as MAXv src0, src0):
 */
static void
add_regs_vector_1(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
	assert(inst->Instruction.NumSrcRegs == 1);
	assert(inst->Instruction.NumDstRegs == 1);

	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
	add_src_reg(ctx, alu, &inst->Src[0].Register);
	add_src_reg(ctx, alu, &inst->Src[0].Register);
	add_vector_clamp(inst, alu);
}
458
/* Wire up a two-src vector op: dst, src0, src1, plus saturate. */
static void
add_regs_vector_2(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
	assert(inst->Instruction.NumSrcRegs == 2);
	assert(inst->Instruction.NumDstRegs == 1);

	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
	add_src_reg(ctx, alu, &inst->Src[0].Register);
	add_src_reg(ctx, alu, &inst->Src[1].Register);
	add_vector_clamp(inst, alu);
}
471
/* Wire up a three-src vector op.  Note the deliberate src order: */
static void
add_regs_vector_3(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
	assert(inst->Instruction.NumSrcRegs == 3);
	assert(inst->Instruction.NumDstRegs == 1);

	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
	/* maybe should re-arrange the syntax some day, but
	 * in assembler/disassembler and what ir.c expects
	 * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1
	 */
	add_src_reg(ctx, alu, &inst->Src[2].Register);
	add_src_reg(ctx, alu, &inst->Src[0].Register);
	add_src_reg(ctx, alu, &inst->Src[1].Register);
	add_vector_clamp(inst, alu);
}
489
/* The hw ALU co-issues a vector and a scalar op; when only the scalar
 * slot is used we still must populate the vector dst/src registers:
 */
static void
add_regs_dummy_vector(struct ir2_instruction *alu)
{
	/* create dummy, non-written vector dst/src regs
	 * for unused vector instr slot:
	 */
	ir2_reg_create(alu, 0, "____", 0); /* vector dst */
	ir2_reg_create(alu, 0, NULL, 0);   /* vector src1 */
	ir2_reg_create(alu, 0, NULL, 0);   /* vector src2 */
}
500
/* Wire up a single-src scalar op: dummy vector slot first, then the
 * scalar dst/src plus saturate:
 */
static void
add_regs_scalar_1(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
{
	assert(inst->Instruction.NumSrcRegs == 1);
	assert(inst->Instruction.NumDstRegs == 1);

	add_regs_dummy_vector(alu);

	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
	add_src_reg(ctx, alu, &inst->Src[0].Register);
	add_scalar_clamp(inst, alu);
}
514
515 /*
516 * Helpers for TGSI instructions that don't map to a single shader instr:
517 */
518
/* Build a src register referencing the same register as 'dst', with
 * identity swizzle and no modifiers:
 */
static void
src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
{
	src->File = dst->File;
	src->Indirect = dst->Indirect;
	src->Dimension = dst->Dimension;
	src->Index = dst->Index;
	src->Absolute = 0;
	src->Negate = 0;
	src->SwizzleX = TGSI_SWIZZLE_X;
	src->SwizzleY = TGSI_SWIZZLE_Y;
	src->SwizzleZ = TGSI_SWIZZLE_Z;
	src->SwizzleW = TGSI_SWIZZLE_W;
}
533
/* Get internal-temp src/dst to use for a sequence of instructions
 * generated by a single TGSI op.  The temp is only valid until the end
 * of the current TGSI instruction (num_internal_temps is reset in
 * translate_instruction()).
 */
static void
get_internal_temp(struct fd2_compile_context *ctx,
		struct tgsi_dst_register *tmp_dst,
		struct tgsi_src_register *tmp_src)
{
	int n;

	tmp_dst->File = TGSI_FILE_TEMPORARY;
	tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
	tmp_dst->Indirect = 0;
	tmp_dst->Dimension = 0;

	/* assign next temporary: */
	n = ctx->num_internal_temps++;
	/* skip over the slot reserved for the predicate register, if it
	 * is currently live:
	 */
	if (ctx->pred_reg != -1)
		n++;

	/* internal temps live above the TGSI-declared temporaries: */
	tmp_dst->Index = ctx->num_regs[TGSI_FILE_TEMPORARY] + n;

	src_from_dst(tmp_src, tmp_dst);
}
558
/* Build dst (and optionally src, broadcasting .w) registers referencing
 * the live predicate value, which is kept in the .w component of its
 * register.
 *
 * NOTE(review): Index is set to the already-mapped GPR number here, but
 * add_dst_reg()/add_src_reg() map TGSI_FILE_TEMPORARY indices through
 * get_temp_gpr() again -- so the input-count offset appears to be
 * applied twice.  It is at least self-consistent (dst and src agree),
 * but confirm the intended register against ctx->pred_reg.
 */
static void
get_predicate(struct fd2_compile_context *ctx, struct tgsi_dst_register *dst,
		struct tgsi_src_register *src)
{
	assert(ctx->pred_reg != -1);

	dst->File = TGSI_FILE_TEMPORARY;
	dst->WriteMask = TGSI_WRITEMASK_W;
	dst->Indirect = 0;
	dst->Dimension = 0;
	dst->Index = get_temp_gpr(ctx, ctx->pred_reg);

	if (src) {
		src_from_dst(src, dst);
		src->SwizzleX = TGSI_SWIZZLE_W;
		src->SwizzleY = TGSI_SWIZZLE_W;
		src->SwizzleZ = TGSI_SWIZZLE_W;
		src->SwizzleW = TGSI_SWIZZLE_W;
	}
}
579
/* Enter a predicated (IF) block: evaluate 'src' into the predicate
 * register and save the current ir pred state on the stack so
 * pop_predicate() can restore it at ENDIF.  Nested levels AND the new
 * condition into the existing predicate value.
 */
static void
push_predicate(struct fd2_compile_context *ctx, struct tgsi_src_register *src)
{
	struct ir2_instruction *alu;
	struct tgsi_dst_register pred_dst;

	/* NOTE blob compiler seems to always puts PRED_* instrs in a CF by
	 * themselves:
	 */
	ctx->cf = NULL;

	if (ctx->pred_depth == 0) {
		/* assign predicate register: */
		ctx->pred_reg = ctx->num_regs[TGSI_FILE_TEMPORARY];

		get_predicate(ctx, &pred_dst, NULL);

		/* scalar op: pred = (src != 0) */
		alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, PRED_SETNEs);
		add_regs_dummy_vector(alu);
		add_dst_reg(ctx, alu, &pred_dst);
		add_src_reg(ctx, alu, src);
	} else {
		struct tgsi_src_register pred_src;

		get_predicate(ctx, &pred_dst, &pred_src);

		/* nested IF: combine with the enclosing predicate: */
		alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
		add_dst_reg(ctx, alu, &pred_dst);
		add_src_reg(ctx, alu, &pred_src);
		add_src_reg(ctx, alu, src);

		// XXX need to make PRED_SETE_PUSHv IR2_PRED_NONE.. but need to make
		// sure src reg is valid if it was calculated with a predicate
		// condition..
		alu->pred = IR2_PRED_NONE;
	}

	/* save previous pred state to restore in pop_predicate(): */
	ctx->pred_stack[ctx->pred_depth++] = ctx->so->ir->pred;

	ctx->cf = NULL;
}
622
/* Leave a predicated (IF) block: restore the ir pred state saved by
 * push_predicate(), and -- if still nested -- recompute the predicate
 * value for the enclosing level.  At the outermost level the predicate
 * register is released.
 */
static void
pop_predicate(struct fd2_compile_context *ctx)
{
	/* NOTE blob compiler seems to always puts PRED_* instrs in a CF by
	 * themselves:
	 */
	ctx->cf = NULL;

	/* restore previous predicate state: */
	ctx->so->ir->pred = ctx->pred_stack[--ctx->pred_depth];

	if (ctx->pred_depth != 0) {
		struct ir2_instruction *alu;
		struct tgsi_dst_register pred_dst;
		struct tgsi_src_register pred_src;

		get_predicate(ctx, &pred_dst, &pred_src);

		alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, PRED_SET_POPs);
		add_regs_dummy_vector(alu);
		add_dst_reg(ctx, alu, &pred_dst);
		add_src_reg(ctx, alu, &pred_src);
		/* must execute unconditionally to recover the outer level: */
		alu->pred = IR2_PRED_NONE;
	} else {
		/* predicate register no longer needed: */
		ctx->pred_reg = -1;
	}

	ctx->cf = NULL;
}
653
654 static void
655 get_immediate(struct fd2_compile_context *ctx,
656 struct tgsi_src_register *reg, uint32_t val)
657 {
658 unsigned neg, swiz, idx, i;
659 /* actually maps 1:1 currently.. not sure if that is safe to rely on: */
660 static const unsigned swiz2tgsi[] = {
661 TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
662 };
663
664 for (i = 0; i < ctx->immediate_idx; i++) {
665 swiz = i % 4;
666 idx = i / 4;
667
668 if (ctx->so->immediates[idx].val[swiz] == val) {
669 neg = 0;
670 break;
671 }
672
673 if (ctx->so->immediates[idx].val[swiz] == -val) {
674 neg = 1;
675 break;
676 }
677 }
678
679 if (i == ctx->immediate_idx) {
680 /* need to generate a new immediate: */
681 swiz = i % 4;
682 idx = i / 4;
683 neg = 0;
684 ctx->so->immediates[idx].val[swiz] = val;
685 ctx->so->num_immediates = idx + 1;
686 ctx->immediate_idx++;
687 }
688
689 reg->File = TGSI_FILE_IMMEDIATE;
690 reg->Indirect = 0;
691 reg->Dimension = 0;
692 reg->Index = idx;
693 reg->Absolute = 0;
694 reg->Negate = neg;
695 reg->SwizzleX = swiz2tgsi[swiz];
696 reg->SwizzleY = swiz2tgsi[swiz];
697 reg->SwizzleZ = swiz2tgsi[swiz];
698 reg->SwizzleW = swiz2tgsi[swiz];
699 }
700
/* POW(a,b) = EXP2(b * LOG2(a)) */
static void
translate_pow(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register tmp_src;
	struct ir2_instruction *alu;

	get_internal_temp(ctx, &tmp_dst, &tmp_src);

	/* tmp = log2(a)  (scalar op, vector slot dummied out): */
	alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, LOG_CLAMP);
	add_regs_dummy_vector(alu);
	add_dst_reg(ctx, alu, &tmp_dst);
	add_src_reg(ctx, alu, &inst->Src[0].Register);

	/* tmp = b * tmp: */
	alu = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
	add_dst_reg(ctx, alu, &tmp_dst);
	add_src_reg(ctx, alu, &tmp_src);
	add_src_reg(ctx, alu, &inst->Src[1].Register);

	/* NOTE: some of the instructions, like EXP_IEEE, seem hard-
	 * coded to take their input from the w component.
	 */
	switch(inst->Dst[0].Register.WriteMask) {
	case TGSI_WRITEMASK_X:
		tmp_src.SwizzleW = TGSI_SWIZZLE_X;
		break;
	case TGSI_WRITEMASK_Y:
		tmp_src.SwizzleW = TGSI_SWIZZLE_Y;
		break;
	case TGSI_WRITEMASK_Z:
		tmp_src.SwizzleW = TGSI_SWIZZLE_Z;
		break;
	case TGSI_WRITEMASK_W:
		tmp_src.SwizzleW = TGSI_SWIZZLE_W;
		break;
	default:
		/* only single-component writemasks are handled here: */
		DBG("invalid writemask!");
		assert(0);
		break;
	}

	/* dst = exp2(tmp.w): */
	alu = ir2_instr_create_alu(next_exec_cf(ctx), ~0, EXP_IEEE);
	add_regs_dummy_vector(alu);
	add_dst_reg(ctx, alu, &inst->Dst[0].Register);
	add_src_reg(ctx, alu, &tmp_src);
	add_scalar_clamp(inst, alu);
}
750
751 static void
752 translate_tex(struct fd2_compile_context *ctx,
753 struct tgsi_full_instruction *inst, unsigned opc)
754 {
755 struct ir2_instruction *instr;
756 struct ir2_register *reg;
757 struct tgsi_dst_register tmp_dst;
758 struct tgsi_src_register tmp_src;
759 const struct tgsi_src_register *coord;
760 bool using_temp = (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) ||
761 (inst->Instruction.Saturate != TGSI_SAT_NONE);
762 int idx;
763
764 if (using_temp || (opc == TGSI_OPCODE_TXP))
765 get_internal_temp(ctx, &tmp_dst, &tmp_src);
766
767 if (opc == TGSI_OPCODE_TXP) {
768 static const char *swiz[] = {
769 [TGSI_SWIZZLE_X] = "xxxx",
770 [TGSI_SWIZZLE_Y] = "yyyy",
771 [TGSI_SWIZZLE_Z] = "zzzz",
772 [TGSI_SWIZZLE_W] = "wwww",
773 };
774
775 /* TXP - Projective Texture Lookup:
776 *
777 * coord.x = src0.x / src.w
778 * coord.y = src0.y / src.w
779 * coord.z = src0.z / src.w
780 * coord.w = src0.w
781 * bias = 0.0
782 *
783 * dst = texture_sample(unit, coord, bias)
784 */
785 instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, RECIP_IEEE);
786
787 /* MAXv: */
788 add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "___w";
789 add_src_reg(ctx, instr, &inst->Src[0].Register);
790 add_src_reg(ctx, instr, &inst->Src[0].Register);
791
792 /* RECIP_IEEE: */
793 add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "x___";
794 add_src_reg(ctx, instr, &inst->Src[0].Register)->swizzle =
795 swiz[inst->Src[0].Register.SwizzleW];
796
797 instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
798 add_dst_reg(ctx, instr, &tmp_dst)->swizzle = "xyz_";
799 add_src_reg(ctx, instr, &tmp_src)->swizzle = "xxxx";
800 add_src_reg(ctx, instr, &inst->Src[0].Register);
801
802 coord = &tmp_src;
803 } else {
804 coord = &inst->Src[0].Register;
805 }
806
807 instr = ir2_instr_create(next_exec_cf(ctx), IR2_FETCH);
808 instr->fetch.opc = TEX_FETCH;
809 instr->fetch.is_cube = (inst->Texture.Texture == TGSI_TEXTURE_3D);
810 assert(inst->Texture.NumOffsets <= 1); // TODO what to do in other cases?
811
812 /* save off the tex fetch to be patched later with correct const_idx: */
813 idx = ctx->so->num_tfetch_instrs++;
814 ctx->so->tfetch_instrs[idx].samp_id = inst->Src[1].Register.Index;
815 ctx->so->tfetch_instrs[idx].instr = instr;
816
817 add_dst_reg(ctx, instr, using_temp ? &tmp_dst : &inst->Dst[0].Register);
818 reg = add_src_reg(ctx, instr, coord);
819
820 /* blob compiler always sets 3rd component to same as 1st for 2d: */
821 if (inst->Texture.Texture == TGSI_TEXTURE_2D)
822 reg->swizzle[2] = reg->swizzle[0];
823
824 /* dst register needs to be marked for sync: */
825 ctx->need_sync |= 1 << instr->regs[0]->num;
826
827 /* TODO we need some way to know if the tex fetch needs to sync on alu pipe.. */
828 instr->sync = true;
829
830 if (using_temp) {
831 /* texture fetch can't write directly to export, so if tgsi
832 * is telling us the dst register is in output file, we load
833 * the texture to a temp and the use ALU instruction to move
834 * to output
835 */
836 instr = ir2_instr_create_alu(next_exec_cf(ctx), MAXv, ~0);
837
838 add_dst_reg(ctx, instr, &inst->Dst[0].Register);
839 add_src_reg(ctx, instr, &tmp_src);
840 add_src_reg(ctx, instr, &tmp_src);
841 add_vector_clamp(inst, instr);
842 }
843 }
844
/* SGE(a,b) = GTE((b - a), 1.0, 0.0) */
/* SLT(a,b) = GTE((b - a), 0.0, 1.0) */
static void
translate_sge_slt(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst, unsigned opc)
{
	struct ir2_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register tmp_src;
	struct tgsi_src_register tmp_const;
	float c0, c1;

	switch (opc) {
	default:
		assert(0);
		/* fallthrough (after assert) to keep c0/c1 initialized: */
	case TGSI_OPCODE_SGE:
		c0 = 1.0;
		c1 = 0.0;
		break;
	case TGSI_OPCODE_SLT:
		c0 = 0.0;
		c1 = 1.0;
		break;
	}

	get_internal_temp(ctx, &tmp_dst, &tmp_src);

	/* tmp = b - a: */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst);
	add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;
	add_src_reg(ctx, instr, &inst->Src[1].Register);

	/* dst = (tmp >= 0) ? c0 : c1: */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), CNDGTEv, ~0);
	add_dst_reg(ctx, instr, &inst->Dst[0].Register);
	/* maybe should re-arrange the syntax some day, but
	 * in assembler/disassembler and what ir.c expects
	 * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1
	 */
	get_immediate(ctx, &tmp_const, fui(c0));
	add_src_reg(ctx, instr, &tmp_const);
	add_src_reg(ctx, instr, &tmp_src);
	get_immediate(ctx, &tmp_const, fui(c1));
	add_src_reg(ctx, instr, &tmp_const);
}
889
/* LRP(a,b,c) = (a * b) + ((1 - a) * c) */
static void
translate_lrp(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst,
		unsigned opc)
{
	struct ir2_instruction *instr;
	struct tgsi_dst_register tmp_dst1, tmp_dst2;
	struct tgsi_src_register tmp_src1, tmp_src2;
	struct tgsi_src_register tmp_const;

	get_internal_temp(ctx, &tmp_dst1, &tmp_src1);
	get_internal_temp(ctx, &tmp_dst2, &tmp_src2);

	get_immediate(ctx, &tmp_const, fui(1.0));

	/* tmp1 = (a * b) */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst1);
	add_src_reg(ctx, instr, &inst->Src[0].Register);
	add_src_reg(ctx, instr, &inst->Src[1].Register);

	/* tmp2 = (1 - a) */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst2);
	add_src_reg(ctx, instr, &tmp_const);
	add_src_reg(ctx, instr, &inst->Src[0].Register)->flags |= IR2_REG_NEGATE;

	/* tmp2 = tmp2 * c */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), MULv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst2);
	add_src_reg(ctx, instr, &tmp_src2);
	add_src_reg(ctx, instr, &inst->Src[2].Register);

	/* dst = tmp1 + tmp2 */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), ADDv, ~0);
	add_dst_reg(ctx, instr, &inst->Dst[0].Register);
	add_src_reg(ctx, instr, &tmp_src1);
	add_src_reg(ctx, instr, &tmp_src2);
}
930
/* SIN/COS: range-reduce the argument before handing it to the hw
 * SIN/COS scalar op.  The constants below are:
 *   0.159155 ~= 1/(2*pi),  6.283185 ~= 2*pi,  -3.141593 ~= -pi
 * so the sequence computes  frac(x/(2*pi) + 0.5) * 2*pi - pi, folding
 * the argument into [-pi, pi) (assumes the hw op expects that range --
 * matches what the emitted constants imply).
 */
static void
translate_trig(struct fd2_compile_context *ctx,
		struct tgsi_full_instruction *inst,
		unsigned opc)
{
	struct ir2_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register tmp_src;
	struct tgsi_src_register tmp_const;
	instr_scalar_opc_t op;

	switch (opc) {
	default:
		assert(0);
		/* fallthrough (after assert) to keep op initialized: */
	case TGSI_OPCODE_SIN:
		op = SIN;
		break;
	case TGSI_OPCODE_COS:
		op = COS;
		break;
	}

	get_internal_temp(ctx, &tmp_dst, &tmp_src);

	/* the reduction works on a single component (x): */
	tmp_dst.WriteMask = TGSI_WRITEMASK_X;
	tmp_src.SwizzleX = tmp_src.SwizzleY =
			tmp_src.SwizzleZ = tmp_src.SwizzleW = TGSI_SWIZZLE_X;

	/* maybe should re-arrange the syntax some day, but
	 * in assembler/disassembler and what ir.c expects
	 * is: MULADDv Rdst = Rsrc2 + Rsrc0 * Rscr1
	 */
	/* tmp.x = 0.5 + x * (1/(2*pi)): */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst);
	get_immediate(ctx, &tmp_const, fui(0.5));
	add_src_reg(ctx, instr, &tmp_const);
	add_src_reg(ctx, instr, &inst->Src[0].Register);
	get_immediate(ctx, &tmp_const, fui(0.159155));
	add_src_reg(ctx, instr, &tmp_const);

	/* tmp.x = frac(tmp.x): */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), FRACv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst);
	add_src_reg(ctx, instr, &tmp_src);
	add_src_reg(ctx, instr, &tmp_src);

	/* tmp.x = -pi + tmp.x * (2*pi): */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), MULADDv, ~0);
	add_dst_reg(ctx, instr, &tmp_dst);
	get_immediate(ctx, &tmp_const, fui(-3.141593));
	add_src_reg(ctx, instr, &tmp_const);
	add_src_reg(ctx, instr, &tmp_src);
	get_immediate(ctx, &tmp_const, fui(6.283185));
	add_src_reg(ctx, instr, &tmp_const);

	/* dst = sin/cos(tmp.x): */
	instr = ir2_instr_create_alu(next_exec_cf(ctx), ~0, op);
	add_regs_dummy_vector(instr);
	add_dst_reg(ctx, instr, &inst->Dst[0].Register);
	add_src_reg(ctx, instr, &tmp_src);
}
989
990 /*
991 * Main part of compiler/translator:
992 */
993
994 static void
995 translate_instruction(struct fd2_compile_context *ctx,
996 struct tgsi_full_instruction *inst)
997 {
998 unsigned opc = inst->Instruction.Opcode;
999 struct ir2_instruction *instr;
1000 static struct ir2_cf *cf;
1001
1002 if (opc == TGSI_OPCODE_END)
1003 return;
1004
1005 if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
1006 unsigned num = inst->Dst[0].Register.Index;
1007 /* seems like we need to ensure that position vs param/pixel
1008 * exports don't end up in the same EXEC clause.. easy way
1009 * to do this is force a new EXEC clause on first appearance
1010 * of an position or param/pixel export.
1011 */
1012 if ((num == ctx->position) || (num == ctx->psize)) {
1013 if (ctx->num_position > 0) {
1014 ctx->cf = NULL;
1015 ir2_cf_create_alloc(ctx->so->ir, SQ_POSITION,
1016 ctx->num_position - 1);
1017 ctx->num_position = 0;
1018 }
1019 } else {
1020 if (ctx->num_param > 0) {
1021 ctx->cf = NULL;
1022 ir2_cf_create_alloc(ctx->so->ir, SQ_PARAMETER_PIXEL,
1023 ctx->num_param - 1);
1024 ctx->num_param = 0;
1025 }
1026 }
1027 }
1028
1029 cf = next_exec_cf(ctx);
1030
1031 /* TODO turn this into a table: */
1032 switch (opc) {
1033 case TGSI_OPCODE_MOV:
1034 instr = ir2_instr_create_alu(cf, MAXv, ~0);
1035 add_regs_vector_1(ctx, inst, instr);
1036 break;
1037 case TGSI_OPCODE_RCP:
1038 instr = ir2_instr_create_alu(cf, ~0, RECIP_IEEE);
1039 add_regs_scalar_1(ctx, inst, instr);
1040 break;
1041 case TGSI_OPCODE_RSQ:
1042 instr = ir2_instr_create_alu(cf, ~0, RECIPSQ_IEEE);
1043 add_regs_scalar_1(ctx, inst, instr);
1044 break;
1045 case TGSI_OPCODE_MUL:
1046 instr = ir2_instr_create_alu(cf, MULv, ~0);
1047 add_regs_vector_2(ctx, inst, instr);
1048 break;
1049 case TGSI_OPCODE_ADD:
1050 instr = ir2_instr_create_alu(cf, ADDv, ~0);
1051 add_regs_vector_2(ctx, inst, instr);
1052 break;
1053 case TGSI_OPCODE_DP3:
1054 instr = ir2_instr_create_alu(cf, DOT3v, ~0);
1055 add_regs_vector_2(ctx, inst, instr);
1056 break;
1057 case TGSI_OPCODE_DP4:
1058 instr = ir2_instr_create_alu(cf, DOT4v, ~0);
1059 add_regs_vector_2(ctx, inst, instr);
1060 break;
1061 case TGSI_OPCODE_MIN:
1062 instr = ir2_instr_create_alu(cf, MINv, ~0);
1063 add_regs_vector_2(ctx, inst, instr);
1064 break;
1065 case TGSI_OPCODE_MAX:
1066 instr = ir2_instr_create_alu(cf, MAXv, ~0);
1067 add_regs_vector_2(ctx, inst, instr);
1068 break;
1069 case TGSI_OPCODE_SLT:
1070 case TGSI_OPCODE_SGE:
1071 translate_sge_slt(ctx, inst, opc);
1072 break;
1073 case TGSI_OPCODE_MAD:
1074 instr = ir2_instr_create_alu(cf, MULADDv, ~0);
1075 add_regs_vector_3(ctx, inst, instr);
1076 break;
1077 case TGSI_OPCODE_LRP:
1078 translate_lrp(ctx, inst, opc);
1079 break;
1080 case TGSI_OPCODE_FRC:
1081 instr = ir2_instr_create_alu(cf, FRACv, ~0);
1082 add_regs_vector_1(ctx, inst, instr);
1083 break;
1084 case TGSI_OPCODE_FLR:
1085 instr = ir2_instr_create_alu(cf, FLOORv, ~0);
1086 add_regs_vector_1(ctx, inst, instr);
1087 break;
1088 case TGSI_OPCODE_EX2:
1089 instr = ir2_instr_create_alu(cf, ~0, EXP_IEEE);
1090 add_regs_scalar_1(ctx, inst, instr);
1091 break;
1092 case TGSI_OPCODE_POW:
1093 translate_pow(ctx, inst);
1094 break;
1095 case TGSI_OPCODE_ABS:
1096 instr = ir2_instr_create_alu(cf, MAXv, ~0);
1097 add_regs_vector_1(ctx, inst, instr);
1098 instr->regs[1]->flags |= IR2_REG_NEGATE; /* src0 */
1099 break;
1100 case TGSI_OPCODE_COS:
1101 case TGSI_OPCODE_SIN:
1102 translate_trig(ctx, inst, opc);
1103 break;
1104 case TGSI_OPCODE_TEX:
1105 case TGSI_OPCODE_TXP:
1106 translate_tex(ctx, inst, opc);
1107 break;
1108 case TGSI_OPCODE_CMP:
1109 instr = ir2_instr_create_alu(cf, CNDGTEv, ~0);
1110 add_regs_vector_3(ctx, inst, instr);
1111 // TODO this should be src0 if regs where in sane order..
1112 instr->regs[2]->flags ^= IR2_REG_NEGATE; /* src1 */
1113 break;
1114 case TGSI_OPCODE_IF:
1115 push_predicate(ctx, &inst->Src[0].Register);
1116 ctx->so->ir->pred = IR2_PRED_EQ;
1117 break;
1118 case TGSI_OPCODE_ELSE:
1119 ctx->so->ir->pred = IR2_PRED_NE;
1120 /* not sure if this is required in all cases, but blob compiler
1121 * won't combine EQ and NE in same CF:
1122 */
1123 ctx->cf = NULL;
1124 break;
1125 case TGSI_OPCODE_ENDIF:
1126 pop_predicate(ctx);
1127 break;
1128 case TGSI_OPCODE_F2I:
1129 instr = ir2_instr_create_alu(cf, TRUNCv, ~0);
1130 add_regs_vector_1(ctx, inst, instr);
1131 break;
1132 default:
1133 DBG("unknown TGSI opc: %s", tgsi_get_opcode_name(opc));
1134 tgsi_dump(ctx->so->tokens, 0);
1135 assert(0);
1136 break;
1137 }
1138
1139 /* internal temporaries are only valid for the duration of a single
1140 * TGSI instruction:
1141 */
1142 ctx->num_internal_temps = 0;
1143 }
1144
1145 static void
1146 compile_instructions(struct fd2_compile_context *ctx)
1147 {
1148 while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
1149 tgsi_parse_token(&ctx->parser);
1150
1151 switch (ctx->parser.FullToken.Token.Type) {
1152 case TGSI_TOKEN_TYPE_INSTRUCTION:
1153 translate_instruction(ctx,
1154 &ctx->parser.FullToken.FullInstruction);
1155 break;
1156 default:
1157 break;
1158 }
1159 }
1160
1161 ctx->cf->cf_type = EXEC_END;
1162 }
1163
/* Compile the TGSI tokens in 'so' into an ir2 shader, replacing any
 * previously compiled ir.  For a VS, the vertex fetch instructions are
 * emitted first; for an FS, the program's export-linkage map is reset
 * (0xff == unassigned) so VS/FS varyings get re-linked.
 *
 * Returns 0 on success, -1 if the TGSI could not be parsed.
 */
int
fd2_compile_shader(struct fd_program_stateobj *prog,
		struct fd2_shader_stateobj *so)
{
	struct fd2_compile_context ctx;

	ir2_shader_destroy(so->ir);
	so->ir = ir2_shader_create();
	so->num_vfetch_instrs = so->num_tfetch_instrs = so->num_immediates = 0;

	if (compile_init(&ctx, prog, so) != TGSI_PARSE_OK)
		return -1;

	if (ctx.type == TGSI_PROCESSOR_VERTEX) {
		compile_vtx_fetch(&ctx);
	} else if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
		prog->num_exports = 0;
		memset(prog->export_linkage, 0xff,
				sizeof(prog->export_linkage));
	}

	compile_instructions(&ctx);

	compile_free(&ctx);

	return 0;
}
1191