st/nine: Add D3DFMT_DF16 support
[mesa.git] / src / gallium / drivers / freedreno / ir3 / ir3_ra.c
1 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
2
3 /*
4 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a
7 * copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice (including the next
14 * paragraph) shall be included in all copies or substantial portions of the
15 * Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 * SOFTWARE.
24 *
25 * Authors:
26 * Rob Clark <robclark@freedesktop.org>
27 */
28
29 #include "pipe/p_shader_tokens.h"
30 #include "util/u_math.h"
31
32 #include "ir3.h"
33
34 /*
35 * Register Assignment:
36 *
37 * NOTE: currently only works on a single basic block.. need to think
38 * about how multiple basic blocks are going to get scheduled. But
39 * I think I want to re-arrange how blocks work, ie. get rid of the
40 * block nesting thing..
41 *
42 * NOTE: we could do register coalescing (eliminate moves) as part of
43 * the RA step.. OTOH I think we need to do scheduling before register
44 * assignment. And removing a mov affects scheduling (unless
45 * we leave a placeholder nop, which seems lame), so I'm not really
46 * sure how practical it is to do both in a single stage. But OTOH
47 * I'm not really sure a sane way for the CP stage to realize when it
48 * cannot remove a mov due to multi-register constraints..
49 *
50 */
51
52 struct ir3_ra_ctx {
53 struct ir3_block *block;
54 enum shader_t type;
55 bool frag_coord;
56 bool frag_face;
57 int cnt;
58 bool error;
59 };
60
/* Tracing is compiled in only for DEBUG builds, where it is switched by
 * the FD_DBG_OPTMSGS bit of fd_mesa_debug.  In every other build
 * ra_debug is the constant 0, so the dump macros below do nothing.
 */
#ifndef DEBUG
#  define ra_debug 0
#else
#  include "freedreno_util.h"
#  define ra_debug (fd_mesa_debug & FD_DBG_OPTMSGS)
#endif

/* Common body of the two dump macros: when tracing is on, print prefix
 * immediately followed by msg and then call dump_fn(n).  Both prefix and
 * msg must be string literals, since they are pasted together into the
 * format string.
 */
#define ra_trace(prefix, msg, dump_fn, n) do { \
		if (ra_debug) { \
			debug_printf(prefix msg); \
			dump_fn(n); \
		} \
	} while (0)

/* dump the instruction list starting at n, preceded by "-- " msg */
#define ra_dump_list(msg, n)   ra_trace("-- ", msg, ir3_dump_instr_list, n)

/* dump the single instruction n, preceded by ">> " msg */
#define ra_dump_instr(msg, n)  ra_trace(">> ", msg, ir3_dump_instr_single, n)
81
/* Assertion that also records failure in the RA context.
 *
 * The condition is first handed to debug_assert(); if it is false the
 * failed expression is logged and (ctx)->error is set, so the
 * assignment loops (which test ctx->error) can bail out instead of
 * carrying on with a bogus register name.
 *
 * Note that x appears twice in the expansion, so it must not have side
 * effects.
 */
#define ra_assert(ctx, x) do { \
		debug_assert(x); \
		if (!(x)) { \
			debug_printf("RA: failed assert: %s\n", #x); \
			(ctx)->error = true; \
		} \
	} while (0)
89
90
/* Half-precision support was retrofitted without adding a parameter:
 * a register "name" passed around in this file carries REG_HALF OR'd
 * in as a high bit when it refers to a half register.  Small +/-
 * offsets applied to a name (e.g. stepping through a contiguous vec4)
 * keep working as long as they never underflow, which would be a bug
 * regardless.
 */
#define REG_HALF 0x8000

/* Build a temporary struct ir3_register (compound literal) with
 * register number n, flags f and write mask TGSI_WRITEMASK_<wm>;
 * used for regmask lookups.
 */
#define REG(n, wm, f) ((struct ir3_register){ \
		.num = (n), \
		.flags = (f), \
		.wrmask = TGSI_WRITEMASK_ ## wm, \
	})
104
105 /* check that the register exists, is a GPR and is not special (a0/p0) */
106 static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n)
107 {
108 if ((n < instr->regs_count) && reg_gpr(instr->regs[n]) &&
109 !(instr->regs[n]->flags & IR3_REG_SSA))
110 return instr->regs[n];
111 return NULL;
112 }
113
114 /* figure out if an unassigned src register points back to the instr we
115 * are assigning:
116 */
117 static bool instr_used_by(struct ir3_instruction *instr,
118 struct ir3_register *src)
119 {
120 struct ir3_instruction *src_instr = ssa(src);
121 unsigned i;
122 if (instr == src_instr)
123 return true;
124 if (src_instr && is_meta(src_instr))
125 for (i = 1; i < src_instr->regs_count; i++)
126 if (instr_used_by(instr, src_instr->regs[i]))
127 return true;
128
129 return false;
130 }
131
132 static bool instr_is_output(struct ir3_instruction *instr)
133 {
134 struct ir3_block *block = instr->block;
135 unsigned i;
136
137 for (i = 0; i < block->noutputs; i++)
138 if (instr == block->outputs[i])
139 return true;
140
141 return false;
142 }
143
144 /* live means read before written */
145 static void compute_liveregs(struct ir3_ra_ctx *ctx,
146 struct ir3_instruction *instr, regmask_t *liveregs)
147 {
148 struct ir3_block *block = instr->block;
149 struct ir3_instruction *n;
150 regmask_t written;
151 unsigned i;
152
153 regmask_init(&written);
154
155 for (n = instr->next; n; n = n->next) {
156 struct ir3_register *r;
157
158 if (is_meta(n))
159 continue;
160
161 /* check first src's read: */
162 for (i = 1; i < n->regs_count; i++) {
163 r = reg_check(n, i);
164 if (r)
165 regmask_set_if_not(liveregs, r, &written);
166
167 /* if any src points back to the instruction(s) in
168 * the block of neighbors that we are assigning then
169 * mark any written (clobbered) registers as live:
170 */
171 if (instr_used_by(instr, n->regs[i]))
172 regmask_or(liveregs, liveregs, &written);
173 }
174
175 /* meta-instructions don't actually get scheduled,
176 * so don't let it's write confuse us.. what we
177 * really care about is when the src to the meta
178 * instr was written:
179 */
180 if (is_meta(n))
181 continue;
182
183 /* then dst written (if assigned already): */
184 r = reg_check(n, 0);
185 if (r) {
186 /* if an instruction *is* an output, then it is live */
187 if (!instr_is_output(n))
188 regmask_set(&written, r);
189 }
190
191 }
192
193 /* be sure to account for output registers too: */
194 for (i = 0; i < block->noutputs; i++) {
195 struct ir3_register *r;
196 if (!block->outputs[i])
197 continue;
198 r = reg_check(block->outputs[i], 0);
199 if (r)
200 regmask_set_if_not(liveregs, r, &written);
201 }
202
203 /* if instruction is output, we need a reg that isn't written
204 * before the end.. equiv to the instr_used_by() check above
205 * in the loop body
206 * TODO maybe should follow fanin/fanout?
207 */
208 if (instr_is_output(instr))
209 regmask_or(liveregs, liveregs, &written);
210 }
211
212 static int find_available(regmask_t *liveregs, int size, bool half)
213 {
214 unsigned i;
215 unsigned f = half ? IR3_REG_HALF : 0;
216 for (i = 0; i < MAX_REG - size; i++) {
217 if (!regmask_get(liveregs, &REG(i, X, f))) {
218 unsigned start = i++;
219 for (; (i < MAX_REG) && ((i - start) < size); i++)
220 if (regmask_get(liveregs, &REG(i, X, f)))
221 break;
222 if ((i - start) >= size)
223 return start;
224 }
225 }
226 assert(0);
227 return -1;
228 }
229
230 static int alloc_block(struct ir3_ra_ctx *ctx,
231 struct ir3_instruction *instr, int size)
232 {
233 struct ir3_register *dst = instr->regs[0];
234 struct ir3_instruction *n;
235 regmask_t liveregs;
236 unsigned name;
237
238 /* should only ever be called w/ head of neighbor list: */
239 debug_assert(!instr->cp.left);
240
241 regmask_init(&liveregs);
242
243 for (n = instr; n; n = n->cp.right)
244 compute_liveregs(ctx, n, &liveregs);
245
246 /* because we do assignment on fanout nodes for wrmask!=0x1, we
247 * need to handle this special case, where the fanout nodes all
248 * appear after one or more of the consumers of the src node:
249 *
250 * 0098:009: sam _, r2.x
251 * 0028:010: mul.f r3.z, r4.x, c13.x
252 * ; we start assigning here for '0098:009: sam'.. but
253 * ; would miss the usage at '0028:010: mul.f'
254 * 0101:009: _meta:fo _, _[0098:009: sam], off=2
255 */
256 if (is_meta(instr) && (instr->opc == OPC_META_FO))
257 compute_liveregs(ctx, instr->regs[1]->instr, &liveregs);
258
259 name = find_available(&liveregs, size,
260 !!(dst->flags & IR3_REG_HALF));
261
262 if (dst->flags & IR3_REG_HALF)
263 name |= REG_HALF;
264
265 return name;
266 }
267
268 static type_t half_type(type_t type)
269 {
270 switch (type) {
271 case TYPE_F32: return TYPE_F16;
272 case TYPE_U32: return TYPE_U16;
273 case TYPE_S32: return TYPE_S16;
274 /* instructions may already be fixed up: */
275 case TYPE_F16:
276 case TYPE_U16:
277 case TYPE_S16:
278 return type;
279 default:
280 assert(0);
281 return ~0;
282 }
283 }
284
285 /* some instructions need fix-up if dst register is half precision: */
286 static void fixup_half_instr_dst(struct ir3_instruction *instr)
287 {
288 switch (instr->category) {
289 case 1: /* move instructions */
290 instr->cat1.dst_type = half_type(instr->cat1.dst_type);
291 break;
292 case 3:
293 switch (instr->opc) {
294 case OPC_MAD_F32:
295 instr->opc = OPC_MAD_F16;
296 break;
297 case OPC_SEL_B32:
298 instr->opc = OPC_SEL_B16;
299 break;
300 case OPC_SEL_S32:
301 instr->opc = OPC_SEL_S16;
302 break;
303 case OPC_SEL_F32:
304 instr->opc = OPC_SEL_F16;
305 break;
306 case OPC_SAD_S32:
307 instr->opc = OPC_SAD_S16;
308 break;
309 /* instructions may already be fixed up: */
310 case OPC_MAD_F16:
311 case OPC_SEL_B16:
312 case OPC_SEL_S16:
313 case OPC_SEL_F16:
314 case OPC_SAD_S16:
315 break;
316 default:
317 assert(0);
318 break;
319 }
320 break;
321 case 5:
322 instr->cat5.type = half_type(instr->cat5.type);
323 break;
324 }
325 }
326 /* some instructions need fix-up if src register is half precision: */
327 static void fixup_half_instr_src(struct ir3_instruction *instr)
328 {
329 switch (instr->category) {
330 case 1: /* move instructions */
331 instr->cat1.src_type = half_type(instr->cat1.src_type);
332 break;
333 }
334 }
335
336 static void reg_assign(struct ir3_instruction *instr,
337 unsigned r, unsigned name)
338 {
339 struct ir3_register *reg = instr->regs[r];
340
341 reg->flags &= ~IR3_REG_SSA;
342 reg->num = name & ~REG_HALF;
343
344 if (name & REG_HALF) {
345 reg->flags |= IR3_REG_HALF;
346 /* if dst reg being assigned, patch up the instr: */
347 if (reg == instr->regs[0])
348 fixup_half_instr_dst(instr);
349 else
350 fixup_half_instr_src(instr);
351 }
352 }
353
354 static void instr_assign(struct ir3_ra_ctx *ctx,
355 struct ir3_instruction *instr, unsigned name);
356
357 static void instr_assign_src(struct ir3_ra_ctx *ctx,
358 struct ir3_instruction *instr, unsigned r, unsigned name)
359 {
360 reg_assign(instr, r, name);
361
362 if (is_meta(instr)) {
363 switch (instr->opc) {
364 case OPC_META_INPUT:
365 /* shader-input does not have a src, only block input: */
366 debug_assert(instr->regs_count == 2);
367 instr_assign(ctx, instr, name);
368 return;
369 case OPC_META_FO:
370 instr_assign(ctx, instr, name + instr->fo.off);
371 return;
372 case OPC_META_FI:
373 instr_assign(ctx, instr, name - (r - 1));
374 return;
375 case OPC_META_DEREF:
376 /* first arg of meta:deref is the addr reg (do not
377 * propagate), 2nd is actual src (fanin) which does
378 * get propagated)
379 */
380 if (r == 2)
381 instr_assign(ctx, instr, name + instr->deref.off);
382 break;
383 default:
384 break;
385 }
386 }
387 }
388
389 static void instr_assign(struct ir3_ra_ctx *ctx,
390 struct ir3_instruction *instr, unsigned name)
391 {
392 struct ir3_instruction *n;
393 struct ir3_register *reg = instr->regs[0];
394
395 /* check if already assigned: */
396 if (!(reg->flags & IR3_REG_SSA)) {
397 /* ... and if so, sanity check: */
398 ra_assert(ctx, reg->num == (name & ~REG_HALF));
399 return;
400 }
401
402 /* rename this instructions dst register: */
403 reg_assign(instr, 0, name);
404
405 /* and rename any subsequent use of result of this instr: */
406 for (n = instr->next; n && !ctx->error; n = n->next) {
407 unsigned i;
408
409 for (i = 1; i < n->regs_count; i++) {
410 reg = n->regs[i];
411 if ((reg->flags & IR3_REG_SSA) && (reg->instr == instr))
412 instr_assign_src(ctx, n, i, name);
413 }
414 }
415
416 /* To simplify the neighbor logic, and to "avoid" dealing with
417 * instructions which write more than one output, we actually
418 * do register assignment for instructions that produce multiple
419 * outputs on the fanout nodes and propagate up the assignment
420 * to the actual instruction:
421 */
422 if (is_meta(instr) && (instr->opc == OPC_META_FO)) {
423 struct ir3_instruction *src = ssa(instr->regs[1]);
424 debug_assert(name >= instr->fo.off);
425 if (src)
426 instr_assign(ctx, src, name - instr->fo.off);
427 }
428 }
429
430 /* check neighbor list to see if it is already partially (or completely)
431 * assigned, in which case register block is already allocated and we
432 * just need to complete the assignment:
433 */
434 static int check_partial_assignment(struct ir3_ra_ctx *ctx,
435 struct ir3_instruction *instr)
436 {
437 struct ir3_instruction *n;
438 int off = 0;
439
440 debug_assert(!instr->cp.left);
441
442 for (n = instr; n; n = n->cp.right) {
443 struct ir3_register *dst = n->regs[0];
444 if (!(dst->flags & IR3_REG_SSA)) {
445 int name = dst->num - off;
446 debug_assert(name >= 0);
447 return name;
448 }
449 off++;
450 }
451
452 return -1;
453 }
454
455 /* allocate register name(s) for a list of neighboring instructions;
456 * instr should point to leftmost neighbor (head of list)
457 */
458 static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx,
459 struct ir3_instruction *instr)
460 {
461 struct ir3_instruction *n;
462 struct ir3_register *dst;
463 int name;
464
465 debug_assert(!instr->cp.left);
466
467 if (instr->regs_count == 0)
468 return;
469
470 dst = instr->regs[0];
471
472 /* for instructions w/ fanouts, do the actual register assignment
473 * on the group of fanout neighbor nodes and propagate the reg
474 * name back up to the texture instruction.
475 */
476 if (dst->wrmask != 0x1)
477 return;
478
479 name = check_partial_assignment(ctx, instr);
480
481 /* allocate register(s): */
482 if (name >= 0) {
483 /* already partially assigned, just finish the job */
484 } else if (is_addr(instr)) {
485 debug_assert(!instr->cp.right);
486 name = instr->regs[2]->num + instr->deref.off;
487 } else if (reg_gpr(dst)) {
488 int size;
489 /* number of consecutive registers to assign: */
490 size = ir3_neighbor_count(instr);
491 if (dst->wrmask != 0x1)
492 size = MAX2(size, ffs(~dst->wrmask) - 1);
493 name = alloc_block(ctx, instr, size);
494 } else if (dst->flags & IR3_REG_ADDR) {
495 debug_assert(!instr->cp.right);
496 dst->flags &= ~IR3_REG_ADDR;
497 name = regid(REG_A0, 0) | REG_HALF;
498 } else {
499 debug_assert(!instr->cp.right);
500 /* predicate register (p0).. etc */
501 name = regid(REG_P0, 0);
502 debug_assert(dst->num == name);
503 }
504
505 ra_assert(ctx, name >= 0);
506
507 for (n = instr; n && !ctx->error; n = n->cp.right) {
508 instr_assign(ctx, n, name);
509 name++;
510 }
511 }
512
513 static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
514 {
515 struct ir3_instruction *n;
516
517 /* frag shader inputs get pre-assigned, since we have some
518 * constraints/unknowns about setup for some of these regs:
519 */
520 if ((ctx->type == SHADER_FRAGMENT) && !block->parent) {
521 unsigned i = 0, j;
522 if (ctx->frag_face && (i < block->ninputs) && block->inputs[i]) {
523 /* if we have frag_face, it gets hr0.x */
524 instr_assign(ctx, block->inputs[i], REG_HALF | 0);
525 i += 4;
526 }
527 for (j = 0; i < block->ninputs; i++, j++)
528 if (block->inputs[i])
529 instr_assign(ctx, block->inputs[i], j);
530 }
531
532 ra_dump_list("-------\n", block->head);
533
534 for (n = block->head; n && !ctx->error; n = n->next) {
535 ra_dump_instr("ASSIGN: ", n);
536 instr_alloc_and_assign(ctx, ir3_neighbor_first(n));
537 ra_dump_list("-------\n", block->head);
538 }
539
540 return ctx->error ? -1 : 0;
541 }
542
543 int ir3_block_ra(struct ir3_block *block, enum shader_t type,
544 bool frag_coord, bool frag_face)
545 {
546 struct ir3_instruction *n;
547 struct ir3_ra_ctx ctx = {
548 .block = block,
549 .type = type,
550 .frag_coord = frag_coord,
551 .frag_face = frag_face,
552 };
553 int ret;
554
555 /* mark dst registers w/ SSA flag so we can see which
556 * have been assigned so far:
557 * NOTE: we really should set SSA flag consistently on
558 * every dst register in the frontend.
559 */
560 for (n = block->head; n; n = n->next)
561 if (n->regs_count > 0)
562 n->regs[0]->flags |= IR3_REG_SSA;
563
564 ir3_clear_mark(block->shader);
565 ret = block_ra(&ctx, block);
566
567 return ret;
568 }