freedreno/ir3: stop hard-coding FS input regs
[mesa.git] src/gallium/drivers/freedreno/ir3/ir3_shader.c
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */

/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_state.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_format.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_parse.h"

#include "freedreno_context.h"
#include "freedreno_util.h"

#include "ir3_shader.h"
#include "ir3_compiler.h"
#include "ir3_nir.h"

int
ir3_glsl_type_size(const struct glsl_type *type)
{
	return glsl_count_attribute_slots(type, false);
}

static void
delete_variant(struct ir3_shader_variant *v)
{
	if (v->ir)
		ir3_destroy(v->ir);
	if (v->bo)
		fd_bo_del(v->bo);
	free(v);
}
/* for vertex shader, the inputs are loaded into registers before the shader
 * is executed, so max_regs from the shader instructions might not properly
 * reflect the # of registers actually used, especially in the case of
 * passthrough varyings.
 *
 * Likewise, for fragment shader, we can have some regs which are passed
 * input values but never touched by the resulting shader (ie. as a result
 * of dead code elimination or simply because we don't know how to turn
 * the reg off).
 */
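/* Illustrative example (not from the original source): a regid packs the
 * register number and component as (num << 2) | comp, so r2.x is
 * regid(2,0) == 8.  An input at r2.x with compmask 0x7 (.xyz written) has
 * util_last_bit(0x7) - 1 == 2, and (8 + 2) >> 2 == 2, ie. the footprint
 * below must cover at least r2.
 */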
static void
fixup_regfootprint(struct ir3_shader_variant *v)
{
	unsigned i;

	for (i = 0; i < v->inputs_count; i++) {
		/* skip frag inputs fetched via bary.f, since their regs are
		 * not written by the gpu before the shader starts (and in
		 * fact the regids might not even be valid)
		 */
		if (v->inputs[i].bary)
			continue;

		/* ignore high regs that are global to all threads in a warp
		 * (they exist by default) (a5xx+)
		 */
		if (v->inputs[i].regid >= regid(48,0))
			continue;

		if (v->inputs[i].compmask) {
			unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
			int32_t regid = (v->inputs[i].regid + n) >> 2;
			v->info.max_reg = MAX2(v->info.max_reg, regid);
		}
	}

	for (i = 0; i < v->outputs_count; i++) {
		int32_t regid = (v->outputs[i].regid + 3) >> 2;
		v->info.max_reg = MAX2(v->info.max_reg, regid);
	}
}

/* wrapper for ir3_assemble() which does some info fixup based on
 * shader state. Non-static since used by ir3_cmdline too.
 */
void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id)
{
	void *bin;

	bin = ir3_assemble(v->ir, &v->info, gpu_id);
	if (!bin)
		return NULL;

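	/* each instruction is 2 dwords (64b); instrlen is expressed in
	 * instruction groups: 16 instructions per group on a4xx+ vs 4 on
	 * earlier gens (group sizes inferred from the divisors below,
	 * not stated in the original source):
	 */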
	if (gpu_id >= 400) {
		v->instrlen = v->info.sizedwords / (2 * 16);
	} else {
		v->instrlen = v->info.sizedwords / (2 * 4);
	}

	/* NOTE: if relative addressing is used, we set constlen in
	 * the compiler (to worst-case value) since we don't know in
	 * the assembler what the max addr reg value can be:
	 */
	v->constlen = MIN2(255, MAX2(v->constlen, v->info.max_const + 1));

	fixup_regfootprint(v);

	return bin;
}

static void
assemble_variant(struct ir3_shader_variant *v)
{
	struct ir3_compiler *compiler = v->shader->compiler;
	uint32_t gpu_id = compiler->gpu_id;
	uint32_t sz, *bin;

	bin = ir3_shader_assemble(v, gpu_id);
	sz = v->info.sizedwords * 4;

	v->bo = fd_bo_new(compiler->dev, sz,
			DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
			DRM_FREEDRENO_GEM_TYPE_KMEM);

	memcpy(fd_bo_map(v->bo), bin, sz);

	if (fd_mesa_debug & FD_DBG_DISASM) {
		struct ir3_shader_key key = v->key;
		printf("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}\n", v->type,
				key.binning_pass, key.color_two_side, key.half_precision);
		ir3_shader_disasm(v, bin, stdout);
	}

	if (shader_debug_enabled(v->shader->type)) {
		fprintf(stderr, "Native code for unnamed %s shader %s:\n",
			shader_stage_name(v->shader->type), v->shader->nir->info.name);
		if (v->shader->type == SHADER_FRAGMENT)
			fprintf(stderr, "SIMD0\n");
		ir3_shader_disasm(v, bin, stderr);
	}

	free(bin);

	/* no need to keep the ir around beyond this point: */
	ir3_destroy(v->ir);
	v->ir = NULL;
}

static void
dump_shader_info(struct ir3_shader_variant *v, struct pipe_debug_callback *debug)
{
	if (!unlikely(fd_mesa_debug & FD_DBG_SHADERDB))
		return;

	pipe_debug_message(debug, SHADER_INFO, "\n"
			"SHADER-DB: %s prog %d/%d: %u instructions, %u dwords\n"
			"SHADER-DB: %s prog %d/%d: %u half, %u full\n"
			"SHADER-DB: %s prog %d/%d: %u const, %u constlen\n"
			"SHADER-DB: %s prog %d/%d: %u (ss), %u (sy)\n",
			ir3_shader_stage(v->shader),
			v->shader->id, v->id,
			v->info.instrs_count,
			v->info.sizedwords,
			ir3_shader_stage(v->shader),
			v->shader->id, v->id,
			v->info.max_half_reg + 1,
			v->info.max_reg + 1,
			ir3_shader_stage(v->shader),
			v->shader->id, v->id,
			v->info.max_const + 1,
			v->constlen,
			ir3_shader_stage(v->shader),
			v->shader->id, v->id,
			v->info.ss, v->info.sy);
}

static struct ir3_shader_variant *
create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
{
	struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant);
	int ret;

	if (!v)
		return NULL;

	v->id = ++shader->variant_count;
	v->shader = shader;
	v->key = key;
	v->type = shader->type;

	ret = ir3_compile_shader_nir(shader->compiler, v);
	if (ret) {
		debug_error("compile failed!");
		goto fail;
	}

	assemble_variant(v);
	if (!v->bo) {
		debug_error("assemble failed!");
		goto fail;
	}

	return v;

fail:
	delete_variant(v);
	return NULL;
}

struct ir3_shader_variant *
ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,
		struct pipe_debug_callback *debug)
{
	struct ir3_shader_variant *v;

	/* some shader key values only apply to vertex or frag shader,
	 * so normalize the key to avoid constructing multiple identical
	 * variants:
	 */
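	/* eg. half_precision only affects the fragment shader, so clearing
	 * it below for vertex shaders lets draws with hp=0 and hp=1 share
	 * one VS variant instead of compiling two identical ones:
	 */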
	switch (shader->type) {
	case SHADER_FRAGMENT:
		key.binning_pass = false;
		if (key.has_per_samp) {
			key.vsaturate_s = 0;
			key.vsaturate_t = 0;
			key.vsaturate_r = 0;
			key.vastc_srgb = 0;
			key.vsamples = 0;
		}
		break;
	case SHADER_VERTEX:
		key.color_two_side = false;
		key.half_precision = false;
		key.rasterflat = false;
		if (key.has_per_samp) {
			key.fsaturate_s = 0;
			key.fsaturate_t = 0;
			key.fsaturate_r = 0;
			key.fastc_srgb = 0;
			key.fsamples = 0;
		}
		break;
	default:
		/* TODO */
		break;
	}

	for (v = shader->variants; v; v = v->next)
		if (ir3_shader_key_equal(&key, &v->key))
			return v;

	/* compile new variant if it doesn't exist already: */
	v = create_variant(shader, key);
	if (v) {
		v->next = shader->variants;
		shader->variants = v;
		dump_shader_info(v, debug);
	}

	return v;
}


void
ir3_shader_destroy(struct ir3_shader *shader)
{
	struct ir3_shader_variant *v, *t;
	for (v = shader->variants; v; ) {
		t = v;
		v = v->next;
		delete_variant(t);
	}
	ralloc_free(shader->nir);
	free(shader);
}

struct ir3_shader *
ir3_shader_create(struct ir3_compiler *compiler,
		const struct pipe_shader_state *cso, enum shader_t type,
		struct pipe_debug_callback *debug)
{
	struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
	shader->compiler = compiler;
	shader->id = ++shader->compiler->shader_count;
	shader->type = type;

	nir_shader *nir;
	if (cso->type == PIPE_SHADER_IR_NIR) {
		/* we take ownership of the reference: */
		nir = cso->ir.nir;

		NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size,
				(nir_lower_io_options)0);
	} else {
		debug_assert(cso->type == PIPE_SHADER_IR_TGSI);
		if (fd_mesa_debug & FD_DBG_DISASM) {
			DBG("dump tgsi: type=%d", shader->type);
			tgsi_dump(cso->tokens, 0);
		}
		nir = ir3_tgsi_to_nir(cso->tokens);
	}
	/* do first pass optimization, ignoring the key: */
	shader->nir = ir3_optimize_nir(shader, nir, NULL);
	if (fd_mesa_debug & FD_DBG_DISASM) {
		DBG("dump nir%d: type=%d", shader->id, shader->type);
		nir_print_shader(shader->nir, stdout);
	}

	shader->stream_output = cso->stream_output;
	if (fd_mesa_debug & FD_DBG_SHADERDB) {
		/* if shader-db run, create a standard variant immediately
		 * (as otherwise nothing will trigger the shader to be
		 * actually compiled)
		 */
		static struct ir3_shader_key key;
		memset(&key, 0, sizeof(key));
		ir3_shader_variant(shader, key, debug);
	}
	return shader;
}

/* a bit annoying that compute-shader and normal shader state objects
 * aren't a bit more aligned.
 */
struct ir3_shader *
ir3_shader_create_compute(struct ir3_compiler *compiler,
		const struct pipe_compute_state *cso,
		struct pipe_debug_callback *debug)
{
	struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);

	shader->compiler = compiler;
	shader->id = ++shader->compiler->shader_count;
	shader->type = SHADER_COMPUTE;

	nir_shader *nir;
	if (cso->ir_type == PIPE_SHADER_IR_NIR) {
		/* we take ownership of the reference: */
		nir = (nir_shader *)cso->prog;

		NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size,
				(nir_lower_io_options)0);
	} else {
		debug_assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
		if (fd_mesa_debug & FD_DBG_DISASM) {
			DBG("dump tgsi: type=%d", shader->type);
			tgsi_dump(cso->prog, 0);
		}
		nir = ir3_tgsi_to_nir(cso->prog);
	}

	/* do first pass optimization, ignoring the key: */
	shader->nir = ir3_optimize_nir(shader, nir, NULL);
	if (fd_mesa_debug & FD_DBG_DISASM) {
		printf("dump nir%d: type=%d\n", shader->id, shader->type);
		nir_print_shader(shader->nir, stdout);
	}

	return shader;
}

static void dump_reg(FILE *out, const char *name, uint32_t r)
{
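	/* r63.x (regid(63,0)) is the "not assigned" sentinel returned by
	 * ir3_find_output_regid()/ir3_find_sysval_regid():
	 */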
	if (r != regid(63,0))
		fprintf(out, "; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]);
}

static void dump_output(FILE *out, struct ir3_shader_variant *so,
		unsigned slot, const char *name)
{
	uint32_t regid;
	regid = ir3_find_output_regid(so, slot);
	dump_reg(out, name, regid);
}

void
ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
{
	struct ir3 *ir = so->ir;
	struct ir3_register *reg;
	const char *type = ir3_shader_stage(so->shader);
	uint8_t regid;
	unsigned i;

	for (i = 0; i < ir->ninputs; i++) {
		if (!ir->inputs[i]) {
			fprintf(out, "; in%d unused\n", i);
			continue;
		}
		reg = ir->inputs[i]->regs[0];
		regid = reg->num;
		fprintf(out, "@in(%sr%d.%c)\tin%d\n",
				(reg->flags & IR3_REG_HALF) ? "h" : "",
				(regid >> 2), "xyzw"[regid & 0x3], i);
	}

	for (i = 0; i < ir->noutputs; i++) {
		if (!ir->outputs[i]) {
			fprintf(out, "; out%d unused\n", i);
			continue;
		}
		/* kill shows up as a virtual output.. skip it! */
		if (is_kill(ir->outputs[i]))
			continue;
		reg = ir->outputs[i]->regs[0];
		regid = reg->num;
		fprintf(out, "@out(%sr%d.%c)\tout%d\n",
				(reg->flags & IR3_REG_HALF) ? "h" : "",
				(regid >> 2), "xyzw"[regid & 0x3], i);
	}

	for (i = 0; i < so->immediates_count; i++) {
		fprintf(out, "@const(c%d.x)\t", so->constbase.immediate + i);
		fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
				so->immediates[i].val[0],
				so->immediates[i].val[1],
				so->immediates[i].val[2],
				so->immediates[i].val[3]);
	}

	disasm_a3xx(bin, so->info.sizedwords, 0, out);

	switch (so->type) {
	case SHADER_VERTEX:
		fprintf(out, "; %s: outputs:", type);
		for (i = 0; i < so->outputs_count; i++) {
			uint8_t regid = so->outputs[i].regid;
			fprintf(out, " r%d.%c (%s)",
					(regid >> 2), "xyzw"[regid & 0x3],
					gl_varying_slot_name(so->outputs[i].slot));
		}
		fprintf(out, "\n");
		fprintf(out, "; %s: inputs:", type);
		for (i = 0; i < so->inputs_count; i++) {
			uint8_t regid = so->inputs[i].regid;
			fprintf(out, " r%d.%c (cm=%x,il=%u,b=%u)",
					(regid >> 2), "xyzw"[regid & 0x3],
					so->inputs[i].compmask,
					so->inputs[i].inloc,
					so->inputs[i].bary);
		}
		fprintf(out, "\n");
		break;
	case SHADER_FRAGMENT:
		fprintf(out, "; %s: outputs:", type);
		for (i = 0; i < so->outputs_count; i++) {
			uint8_t regid = so->outputs[i].regid;
			fprintf(out, " r%d.%c (%s)",
					(regid >> 2), "xyzw"[regid & 0x3],
					gl_frag_result_name(so->outputs[i].slot));
		}
		fprintf(out, "\n");
		fprintf(out, "; %s: inputs:", type);
		for (i = 0; i < so->inputs_count; i++) {
			uint8_t regid = so->inputs[i].regid;
			fprintf(out, " r%d.%c (%s,cm=%x,il=%u,b=%u)",
					(regid >> 2), "xyzw"[regid & 0x3],
					gl_varying_slot_name(so->inputs[i].slot),
					so->inputs[i].compmask,
					so->inputs[i].inloc,
					so->inputs[i].bary);
		}
		fprintf(out, "\n");
		break;
	default:
		/* TODO */
		break;
	}

	/* print generic shader info: */
	fprintf(out, "; %s prog %d/%d: %u instructions, %d half, %d full\n",
			type, so->shader->id, so->id,
			so->info.instrs_count,
			so->info.max_half_reg + 1,
			so->info.max_reg + 1);

	fprintf(out, "; %d const, %u constlen\n",
			so->info.max_const + 1,
			so->constlen);

	fprintf(out, "; %u (ss), %u (sy)\n", so->info.ss, so->info.sy);

	/* print shader type specific info: */
	switch (so->type) {
	case SHADER_VERTEX:
		dump_output(out, so, VARYING_SLOT_POS, "pos");
		dump_output(out, so, VARYING_SLOT_PSIZ, "psize");
		break;
	case SHADER_FRAGMENT:
		dump_reg(out, "pos (bary)",
			ir3_find_sysval_regid(so, SYSTEM_VALUE_VARYING_COORD));
		dump_output(out, so, FRAG_RESULT_DEPTH, "posz");
		if (so->color0_mrt) {
			dump_output(out, so, FRAG_RESULT_COLOR, "color");
		} else {
			dump_output(out, so, FRAG_RESULT_DATA0, "data0");
			dump_output(out, so, FRAG_RESULT_DATA1, "data1");
			dump_output(out, so, FRAG_RESULT_DATA2, "data2");
			dump_output(out, so, FRAG_RESULT_DATA3, "data3");
			dump_output(out, so, FRAG_RESULT_DATA4, "data4");
			dump_output(out, so, FRAG_RESULT_DATA5, "data5");
			dump_output(out, so, FRAG_RESULT_DATA6, "data6");
			dump_output(out, so, FRAG_RESULT_DATA7, "data7");
		}
		/* these two are hard-coded since we don't know how to
		 * program them to anything but all 0's...
		 */
		if (so->frag_coord)
			fprintf(out, "; fragcoord: r0.x\n");
		if (so->frag_face)
			fprintf(out, "; fragface: hr0.x\n");
		break;
	default:
		/* TODO */
		break;
	}

	fprintf(out, "\n");
}

uint64_t
ir3_shader_outputs(const struct ir3_shader *so)
{
	return so->nir->info.outputs_written;
}

/* This has to reach into the fd_context a bit more than the rest of
 * ir3, but it needs to be aligned with the compiler, so both agree
 * on which const regs hold what. And the logic is identical between
 * a3xx/a4xx, the only difference is small details in the actual
 * CP_LOAD_STATE packets (which is handled inside the generation
 * specific ctx->emit_const(_bo)() fxns)
 */
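/* Note (summarizing the code below, not from the original comment): the
 * v->constbase.* offsets (ubo, ssbo_sizes, image_dims, driver_param, tfbo,
 * immediate) are in vec4 units, while ctx->emit_const() takes dword
 * offsets, hence the 'offset * 4' at each call site.
 */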

#include "freedreno_resource.h"

static void
emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
		struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf)
{
	const unsigned index = 0; /* user consts are index 0 */

	if (constbuf->enabled_mask & (1 << index)) {
		struct pipe_constant_buffer *cb = &constbuf->cb[index];
		unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */

		/* in particular, with binning shader we may end up with
		 * unused consts, ie. we could end up w/ constlen that is
		 * smaller than first_driver_param. In that case truncate
		 * the user consts early to avoid HLSQ lockup caused by
		 * writing too many consts
		 */
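		/* eg. (hypothetical numbers) a binning variant with a constlen
		 * of 8 vec4s but a 12-vec4 user const buffer should emit only
		 * 8 * 4 = 32 dwords:
		 */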
		uint32_t max_const = MIN2(v->num_uniforms, v->constlen);

		// I expect that size should be a multiple of vec4's:
		assert(size == align(size, 4));

		/* and even if the start of the const buffer is before
		 * first_immediate, the end may not be:
		 */
		size = MIN2(size, 4 * max_const);

		if (size > 0) {
			fd_wfi(ctx->batch, ring);
			ctx->emit_const(ring, v->type, 0,
					cb->buffer_offset, size,
					cb->user_buffer, cb->buffer);
		}
	}
}

static void
emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
		struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf)
{
	uint32_t offset = v->constbase.ubo;
	if (v->constlen > offset) {
		uint32_t params = v->num_ubos;
		uint32_t offsets[params];
		struct pipe_resource *prscs[params];

		for (uint32_t i = 0; i < params; i++) {
			const uint32_t index = i + 1; /* UBOs start at index 1 */
			struct pipe_constant_buffer *cb = &constbuf->cb[index];
			assert(!cb->user_buffer);

			if ((constbuf->enabled_mask & (1 << index)) && cb->buffer) {
				offsets[i] = cb->buffer_offset;
				prscs[i] = cb->buffer;
			} else {
				offsets[i] = 0;
				prscs[i] = NULL;
			}
		}

		fd_wfi(ctx->batch, ring);
		ctx->emit_const_bo(ring, v->type, false, offset * 4, params, prscs, offsets);
	}
}

static void
emit_ssbo_sizes(struct fd_context *ctx, const struct ir3_shader_variant *v,
		struct fd_ringbuffer *ring, struct fd_shaderbuf_stateobj *sb)
{
	uint32_t offset = v->constbase.ssbo_sizes;
	if (v->constlen > offset) {
		uint32_t sizes[align(v->const_layout.ssbo_size.count, 4)];
		unsigned mask = v->const_layout.ssbo_size.mask;

		while (mask) {
			unsigned index = u_bit_scan(&mask);
			unsigned off = v->const_layout.ssbo_size.off[index];
			sizes[off] = sb->sb[index].buffer_size;
		}

		fd_wfi(ctx->batch, ring);
		ctx->emit_const(ring, v->type, offset * 4,
				0, ARRAY_SIZE(sizes), sizes, NULL);
	}
}

static void
emit_image_dims(struct fd_context *ctx, const struct ir3_shader_variant *v,
		struct fd_ringbuffer *ring, struct fd_shaderimg_stateobj *si)
{
	uint32_t offset = v->constbase.image_dims;
	if (v->constlen > offset) {
		uint32_t dims[align(v->const_layout.image_dims.count, 4)];
		unsigned mask = v->const_layout.image_dims.mask;

		while (mask) {
			struct pipe_image_view *img;
			struct fd_resource *rsc;
			unsigned index = u_bit_scan(&mask);
			unsigned off = v->const_layout.image_dims.off[index];

			img = &si->si[index];
			rsc = fd_resource(img->resource);

			dims[off + 0] = util_format_get_blocksize(img->format);
			if (img->resource->target != PIPE_BUFFER) {
				unsigned lvl = img->u.tex.level;
				/* note for 2d/cube/etc images, even if re-interpreted
				 * as a different color format, the pixel size should
				 * be the same, so use original dimensions for y and z
				 * stride:
				 */
				dims[off + 1] = rsc->slices[lvl].pitch * rsc->cpp;
				/* see corresponding logic in fd_resource_offset(): */
				if (rsc->layer_first) {
					dims[off + 2] = rsc->layer_size;
				} else {
					dims[off + 2] = rsc->slices[lvl].size0;
				}
			}
		}

		fd_wfi(ctx->batch, ring);
		ctx->emit_const(ring, v->type, offset * 4,
				0, ARRAY_SIZE(dims), dims, NULL);
	}
}

static void
emit_immediates(struct fd_context *ctx, const struct ir3_shader_variant *v,
		struct fd_ringbuffer *ring)
{
	int size = v->immediates_count;
	uint32_t base = v->constbase.immediate;

	/* truncate size to avoid writing constants that shader
	 * does not use:
	 */
	size = MIN2(size + base, v->constlen) - base;

	/* convert out of vec4: */
	base *= 4;
	size *= 4;

	if (size > 0) {
		fd_wfi(ctx->batch, ring);
		ctx->emit_const(ring, v->type, base,
				0, size, v->immediates[0].val, NULL);
	}
}

/* emit stream-out buffers: */
static void
emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v,
		struct fd_ringbuffer *ring)
{
	/* streamout addresses after driver-params: */
	uint32_t offset = v->constbase.tfbo;
	if (v->constlen > offset) {
		struct fd_streamout_stateobj *so = &ctx->streamout;
		struct pipe_stream_output_info *info = &v->shader->stream_output;
		uint32_t params = 4;
		uint32_t offsets[params];
		struct pipe_resource *prscs[params];

		for (uint32_t i = 0; i < params; i++) {
			struct pipe_stream_output_target *target = so->targets[i];

			if (target) {
				offsets[i] = (so->offsets[i] * info->stride[i] * 4) +
						target->buffer_offset;
				prscs[i] = target->buffer;
			} else {
				offsets[i] = 0;
				prscs[i] = NULL;
			}
		}

		fd_wfi(ctx->batch, ring);
		ctx->emit_const_bo(ring, v->type, true, offset * 4, params, prscs, offsets);
	}
}

static uint32_t
max_tf_vtx(struct fd_context *ctx, const struct ir3_shader_variant *v)
{
	struct fd_streamout_stateobj *so = &ctx->streamout;
	struct pipe_stream_output_info *info = &v->shader->stream_output;
	uint32_t maxvtxcnt = 0x7fffffff;

	if (ctx->screen->gpu_id >= 500)
		return 0;
	if (v->key.binning_pass)
		return 0;
	if (v->shader->stream_output.num_outputs == 0)
		return 0;
	if (so->num_targets == 0)
		return 0;

	/* offset to write to is:
	 *
	 *   total_vtxcnt = vtxcnt + offsets[i]
	 *   offset = total_vtxcnt * stride[i]
	 *
	 *   offset =   vtxcnt * stride[i]       ; calculated in shader
	 *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
	 *
	 * assuming for each vtx, each target buffer will have data written
	 * up to 'offset + stride[i]', that leaves maxvtxcnt as:
	 *
	 *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
	 *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
	 *
	 * but shader is actually doing a less-than (rather than less-than-
	 * equal) check, so we can drop the -stride[i].
	 *
	 * TODO is assumption about `offset + stride[i]` legit?
	 */
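	/* Worked example (hypothetical numbers, not from the original
	 * source): a 400-byte target buffer with a stride of 4 dwords
	 * (16 bytes) gives max = 400 / 16 = 25; the shader's less-than
	 * check keeps writes below vertex 25, ie. in-bounds:
	 */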
	for (unsigned i = 0; i < so->num_targets; i++) {
		struct pipe_stream_output_target *target = so->targets[i];
		unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */
		if (target) {
			uint32_t max = target->buffer_size / stride;
			maxvtxcnt = MIN2(maxvtxcnt, max);
		}
	}

	return maxvtxcnt;
}

static void
emit_common_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
		struct fd_context *ctx, enum pipe_shader_type t)
{
	enum fd_dirty_shader_state dirty = ctx->dirty_shader[t];

	if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) {
		struct fd_constbuf_stateobj *constbuf;
		bool shader_dirty;

		constbuf = &ctx->constbuf[t];
		shader_dirty = !!(dirty & FD_DIRTY_SHADER_PROG);

		emit_user_consts(ctx, v, ring, constbuf);
		emit_ubos(ctx, v, ring, constbuf);
		if (shader_dirty)
			emit_immediates(ctx, v, ring);
	}

	if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_SSBO)) {
		struct fd_shaderbuf_stateobj *sb = &ctx->shaderbuf[t];
		emit_ssbo_sizes(ctx, v, ring, sb);
	}

	if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_IMAGE)) {
		struct fd_shaderimg_stateobj *si = &ctx->shaderimg[t];
		emit_image_dims(ctx, v, ring, si);
	}
}

void
ir3_emit_vs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
		struct fd_context *ctx, const struct pipe_draw_info *info)
{
	debug_assert(v->type == SHADER_VERTEX);

	emit_common_consts(v, ring, ctx, PIPE_SHADER_VERTEX);

	/* emit driver params every time: */
	/* TODO skip emit if shader doesn't use driver params to avoid WFI.. */
	if (info) {
		uint32_t offset = v->constbase.driver_param;
		if (v->constlen > offset) {
			uint32_t vertex_params[IR3_DP_VS_COUNT] = {
				[IR3_DP_VTXID_BASE] = info->index_size ?
						info->index_bias : info->start,
				[IR3_DP_VTXCNT_MAX] = max_tf_vtx(ctx, v),
			};
			/* if no user-clip-planes, we don't need to emit the
			 * entire thing:
			 */
			uint32_t vertex_params_size = 4;

			if (v->key.ucp_enables) {
				struct pipe_clip_state *ucp = &ctx->ucp;
				unsigned pos = IR3_DP_UCP0_X;
				for (unsigned i = 0; pos <= IR3_DP_UCP7_W; i++) {
					for (unsigned j = 0; j < 4; j++) {
						vertex_params[pos] = fui(ucp->ucp[i][j]);
						pos++;
					}
				}
				vertex_params_size = ARRAY_SIZE(vertex_params);
			}

			fd_wfi(ctx->batch, ring);

			bool needs_vtxid_base =
				ir3_find_sysval_regid(v, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) != regid(63, 0);

			/* for indirect draw, we need to copy VTXID_BASE from
			 * indirect-draw parameters buffer.. which is annoying
			 * and means we can't easily emit these consts in cmd
			 * stream so need to copy them to bo.
			 */
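			/* The src_off math below assumes the standard indirect-draw
			 * param layout (one dword per field): {count, instance_count,
			 * first_index, index_bias, ...} for indexed draws, and
			 * {count, instance_count, start, ...} for non-indexed.
			 */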
			if (info->indirect && needs_vtxid_base) {
				struct pipe_draw_indirect_info *indirect = info->indirect;
				struct pipe_resource *vertex_params_rsc =
						pipe_buffer_create(&ctx->screen->base,
							PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM,
							vertex_params_size * 4);
				unsigned src_off = info->indirect->offset;
				void *ptr;

				ptr = fd_bo_map(fd_resource(vertex_params_rsc)->bo);
				memcpy(ptr, vertex_params, vertex_params_size * 4);

				if (info->index_size) {
					/* indexed draw, index_bias is 4th field: */
					src_off += 3 * 4;
				} else {
					/* non-indexed draw, start is 3rd field: */
					src_off += 2 * 4;
				}

				/* copy index_bias or start from draw params: */
				ctx->mem_to_mem(ring, vertex_params_rsc, 0,
						indirect->buffer, src_off, 1);

				ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0,
						vertex_params_size, NULL, vertex_params_rsc);

				pipe_resource_reference(&vertex_params_rsc, NULL);
			} else {
				ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0,
						vertex_params_size, vertex_params, NULL);
			}

			/* if needed, emit stream-out buffer addresses: */
			if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) {
				emit_tfbos(ctx, v, ring);
			}
		}
	}
}

void
ir3_emit_fs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
		struct fd_context *ctx)
{
	debug_assert(v->type == SHADER_FRAGMENT);

	emit_common_consts(v, ring, ctx, PIPE_SHADER_FRAGMENT);
}

/* emit compute-shader consts: */
void
ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
		struct fd_context *ctx, const struct pipe_grid_info *info)
{
	debug_assert(v->type == SHADER_COMPUTE);

	emit_common_consts(v, ring, ctx, PIPE_SHADER_COMPUTE);

	/* emit compute-shader driver-params: */
	uint32_t offset = v->constbase.driver_param;
	if (v->constlen > offset) {
		fd_wfi(ctx->batch, ring);

		if (info->indirect) {
			struct pipe_resource *indirect = NULL;
			unsigned indirect_offset;

			/* This is a bit awkward, but CP_LOAD_STATE.EXT_SRC_ADDR needs
			 * to be aligned more strongly than 4 bytes. So in this case
			 * we need a temporary buffer to copy NumWorkGroups.xyz to.
			 *
			 * TODO if previous compute job is writing to info->indirect,
			 * we might need a WFI.. but since we currently flush for each
			 * compute job, we are probably ok for now.
			 */
			if (info->indirect_offset & 0xf) {
				indirect = pipe_buffer_create(&ctx->screen->base,
					PIPE_BIND_COMMAND_ARGS_BUFFER, PIPE_USAGE_STREAM,
					0x1000);
				indirect_offset = 0;

				ctx->mem_to_mem(ring, indirect, 0, info->indirect,
						info->indirect_offset, 3);
			} else {
				pipe_resource_reference(&indirect, info->indirect);
				indirect_offset = info->indirect_offset;
			}

			ctx->emit_const(ring, SHADER_COMPUTE, offset * 4,
					indirect_offset, 4, NULL, indirect);

			pipe_resource_reference(&indirect, NULL);
		} else {
			uint32_t compute_params[IR3_DP_CS_COUNT] = {
				[IR3_DP_NUM_WORK_GROUPS_X] = info->grid[0],
				[IR3_DP_NUM_WORK_GROUPS_Y] = info->grid[1],
				[IR3_DP_NUM_WORK_GROUPS_Z] = info->grid[2],
				[IR3_DP_LOCAL_GROUP_SIZE_X] = info->block[0],
				[IR3_DP_LOCAL_GROUP_SIZE_Y] = info->block[1],
				[IR3_DP_LOCAL_GROUP_SIZE_Z] = info->block[2],
			};

			ctx->emit_const(ring, SHADER_COMPUTE, offset * 4, 0,
					ARRAY_SIZE(compute_params), compute_params, NULL);
		}
	}
}