freedreno: Remove the Emacs mode lines
[mesa.git] src/gallium/drivers/freedreno/ir3/ir3_shader.c
/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_state.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_format.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_parse.h"

#include "freedreno_context.h"
#include "freedreno_util.h"

#include "ir3_shader.h"
#include "ir3_compiler.h"
#include "ir3_nir.h"

int
ir3_glsl_type_size(const struct glsl_type *type)
{
	return glsl_count_attribute_slots(type, false);
}
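
/* An illustrative sketch (not driver code): attribute-slot counting means
 * e.g. a vec3 occupies one vec4 slot while a mat4 occupies four.  The
 * glsl_vec_type()/glsl_matrix_type() helpers below are assumptions based
 * on the nir_types.h C wrappers:
 */
#if 0
	assert(ir3_glsl_type_size(glsl_vec_type(3)) == 1);                        /* vec3 -> 1 slot  */
	assert(ir3_glsl_type_size(glsl_matrix_type(GLSL_TYPE_FLOAT, 4, 4)) == 4); /* mat4 -> 4 slots */
#endif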

static void
delete_variant(struct ir3_shader_variant *v)
{
	if (v->ir)
		ir3_destroy(v->ir);
	if (v->bo)
		fd_bo_del(v->bo);
	if (v->immediates)
		free(v->immediates);
	free(v);
}

/* for vertex shader, the inputs are loaded into registers before the shader
 * is executed, so max_regs from the shader instructions might not properly
 * reflect the # of registers actually used, especially in the case of
 * passthrough varyings.
 *
 * Likewise, for fragment shader, we can have some regs which are passed
 * input values but never touched by the resulting shader (ie. as a result
 * of dead code elimination, or simply because we don't know how to turn
 * the reg off).
 */
static void
fixup_regfootprint(struct ir3_shader_variant *v)
{
	unsigned i;

	for (i = 0; i < v->inputs_count; i++) {
		/* skip frag inputs fetched via bary.f, since their reg's are
		 * not written by the gpu before the shader starts (and in
		 * fact the regid's might not even be valid)
		 */
		if (v->inputs[i].bary)
			continue;

		/* ignore high regs that are global to all threads in a warp
		 * (they exist by default) (a5xx+)
		 */
		if (v->inputs[i].regid >= regid(48,0))
			continue;

		if (v->inputs[i].compmask) {
			unsigned n = util_last_bit(v->inputs[i].compmask) - 1;
			int32_t regid = (v->inputs[i].regid + n) >> 2;
			v->info.max_reg = MAX2(v->info.max_reg, regid);
		}
	}

	for (i = 0; i < v->outputs_count; i++) {
		int32_t regid = (v->outputs[i].regid + 3) >> 2;
		v->info.max_reg = MAX2(v->info.max_reg, regid);
	}
}
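
/* A worked example of the math above (illustration only): scalar register
 * ids pack the component index into the low two bits, so regid(2,1) is
 * r2.y.  An input loaded there with compmask 0x7 (".xyz") last writes
 * scalar id regid(2,1) + 2 == 11, i.e. vec4 register 11 >> 2 == r2:
 */
#if 0
	unsigned n = util_last_bit(0x7) - 1;     /* highest written comp: 2 (".z") */
	assert(((regid(2, 1) + n) >> 2) == 2);   /* footprint reaches r2 */
#endif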

/* wrapper for ir3_assemble() which does some info fixup based on
 * shader state.  Non-static since used by ir3_cmdline too.
 */
void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id)
{
	void *bin;

	bin = ir3_assemble(v->ir, &v->info, gpu_id);
	if (!bin)
		return NULL;

	if (gpu_id >= 400) {
		v->instrlen = v->info.sizedwords / (2 * 16);
	} else {
		v->instrlen = v->info.sizedwords / (2 * 4);
	}

	/* NOTE: if relative addressing is used, we set constlen in
	 * the compiler (to worst-case value) since we don't know in
	 * the assembler what the max addr reg value can be:
	 */
	v->constlen = MIN2(255, MAX2(v->constlen, v->info.max_const + 1));

	fixup_regfootprint(v);

	return bin;
}
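
/* A sketch of the instrlen arithmetic above (illustration only):
 * instructions are 64 bits, i.e. 2 dwords, and instrlen counts hw
 * instruction groups -- 16 instructions (32 dwords) on a4xx+, 4
 * instructions (8 dwords) on a3xx.  So e.g. 64 dwords of instructions:
 */
#if 0
	uint32_t sizedwords = 64;
	assert(sizedwords / (2 * 16) == 2);   /* a4xx+: 2 groups of 16 instrs */
	assert(sizedwords / (2 * 4) == 8);    /* a3xx:  8 groups of 4 instrs  */
#endif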

static void
assemble_variant(struct ir3_shader_variant *v)
{
	struct ir3_compiler *compiler = v->shader->compiler;
	uint32_t gpu_id = compiler->gpu_id;
	uint32_t sz, *bin;

	bin = ir3_shader_assemble(v, gpu_id);
	/* assembly can fail; bail before dereferencing bin (caller treats
	 * a missing v->bo as failure):
	 */
	if (!bin)
		return;

	sz = v->info.sizedwords * 4;

	v->bo = fd_bo_new(compiler->dev, sz,
			DRM_FREEDRENO_GEM_CACHE_WCOMBINE |
			DRM_FREEDRENO_GEM_TYPE_KMEM);

	memcpy(fd_bo_map(v->bo), bin, sz);

	if (fd_mesa_debug & FD_DBG_DISASM) {
		struct ir3_shader_key key = v->key;
		printf("disassemble: type=%d, k={bp=%u,cts=%u,hp=%u}\n", v->type,
			key.binning_pass, key.color_two_side, key.half_precision);
		ir3_shader_disasm(v, bin, stdout);
	}

	if (shader_debug_enabled(v->shader->type)) {
		fprintf(stderr, "Native code for unnamed %s shader %s:\n",
			shader_stage_name(v->shader->type), v->shader->nir->info.name);
		if (v->shader->type == SHADER_FRAGMENT)
			fprintf(stderr, "SIMD0\n");
		ir3_shader_disasm(v, bin, stderr);
	}

	free(bin);

	/* no need to keep the ir around beyond this point: */
	ir3_destroy(v->ir);
	v->ir = NULL;
}

static void
dump_shader_info(struct ir3_shader_variant *v, struct pipe_debug_callback *debug)
{
	if (!unlikely(fd_mesa_debug & FD_DBG_SHADERDB))
		return;

	pipe_debug_message(debug, SHADER_INFO, "\n"
			"SHADER-DB: %s prog %d/%d: %u instructions, %u dwords\n"
			"SHADER-DB: %s prog %d/%d: %u half, %u full\n"
			"SHADER-DB: %s prog %d/%d: %u const, %u constlen\n"
			"SHADER-DB: %s prog %d/%d: %u (ss), %u (sy)\n",
			ir3_shader_stage(v->shader),
			v->shader->id, v->id,
			v->info.instrs_count,
			v->info.sizedwords,
			ir3_shader_stage(v->shader),
			v->shader->id, v->id,
			v->info.max_half_reg + 1,
			v->info.max_reg + 1,
			ir3_shader_stage(v->shader),
			v->shader->id, v->id,
			v->info.max_const + 1,
			v->constlen,
			ir3_shader_stage(v->shader),
			v->shader->id, v->id,
			v->info.ss, v->info.sy);
}

static struct ir3_shader_variant *
create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
{
	struct ir3_shader_variant *v = CALLOC_STRUCT(ir3_shader_variant);
	int ret;

	if (!v)
		return NULL;

	v->id = ++shader->variant_count;
	v->shader = shader;
	v->key = key;
	v->type = shader->type;

	ret = ir3_compile_shader_nir(shader->compiler, v);
	if (ret) {
		debug_error("compile failed!");
		goto fail;
	}

	assemble_variant(v);
	if (!v->bo) {
		debug_error("assemble failed!");
		goto fail;
	}

	return v;

fail:
	delete_variant(v);
	return NULL;
}

struct ir3_shader_variant *
ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,
		struct pipe_debug_callback *debug)
{
	struct ir3_shader_variant *v;

	/* some shader key values only apply to vertex or frag shader,
	 * so normalize the key to avoid constructing multiple identical
	 * variants:
	 */
	switch (shader->type) {
	case SHADER_FRAGMENT:
		key.binning_pass = false;
		if (key.has_per_samp) {
			key.vsaturate_s = 0;
			key.vsaturate_t = 0;
			key.vsaturate_r = 0;
			key.vastc_srgb = 0;
			key.vsamples = 0;
		}
		break;
	case SHADER_VERTEX:
		key.color_two_side = false;
		key.half_precision = false;
		key.rasterflat = false;
		if (key.has_per_samp) {
			key.fsaturate_s = 0;
			key.fsaturate_t = 0;
			key.fsaturate_r = 0;
			key.fastc_srgb = 0;
			key.fsamples = 0;
		}
		break;
	default:
		/* TODO */
		break;
	}

	for (v = shader->variants; v; v = v->next)
		if (ir3_shader_key_equal(&key, &v->key))
			return v;

	/* compile new variant if it doesn't exist already: */
	v = create_variant(shader, key);
	if (v) {
		v->next = shader->variants;
		shader->variants = v;
		dump_shader_info(v, debug);
	}

	return v;
}
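
/* Illustration of the normalization above (not driver code): for a vertex
 * shader, fragment-only key bits are zeroed before the variant-list
 * lookup, so two state combinations differing only in those bits share
 * one compiled variant:
 */
#if 0
	struct ir3_shader_key a = {0}, b = {0};
	b.color_two_side = true;   /* fragment-side bit, cleared for SHADER_VERTEX */
	/* after normalization ir3_shader_key_equal(&a, &b) holds, so both
	 * requests resolve to the same variant. */
#endif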


void
ir3_shader_destroy(struct ir3_shader *shader)
{
	struct ir3_shader_variant *v, *t;
	for (v = shader->variants; v; ) {
		t = v;
		v = v->next;
		delete_variant(t);
	}
	ralloc_free(shader->nir);
	free(shader);
}

struct ir3_shader *
ir3_shader_create(struct ir3_compiler *compiler,
		const struct pipe_shader_state *cso, enum shader_t type,
		struct pipe_debug_callback *debug)
{
	struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
	shader->compiler = compiler;
	shader->id = ++shader->compiler->shader_count;
	shader->type = type;

	nir_shader *nir;
	if (cso->type == PIPE_SHADER_IR_NIR) {
		/* we take ownership of the reference: */
		nir = cso->ir.nir;
	} else {
		debug_assert(cso->type == PIPE_SHADER_IR_TGSI);
		if (fd_mesa_debug & FD_DBG_DISASM) {
			DBG("dump tgsi: type=%d", shader->type);
			tgsi_dump(cso->tokens, 0);
		}
		nir = ir3_tgsi_to_nir(cso->tokens);
	}
	NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size,
			(nir_lower_io_options)0);
	/* do first pass optimization, ignoring the key: */
	shader->nir = ir3_optimize_nir(shader, nir, NULL);
	if (fd_mesa_debug & FD_DBG_DISASM) {
		DBG("dump nir%d: type=%d", shader->id, shader->type);
		nir_print_shader(shader->nir, stdout);
	}

	shader->stream_output = cso->stream_output;
	if (fd_mesa_debug & FD_DBG_SHADERDB) {
		/* if shader-db run, create a standard variant immediately
		 * (as otherwise nothing will trigger the shader to be
		 * actually compiled)
		 */
		static struct ir3_shader_key key;
		memset(&key, 0, sizeof(key));
		ir3_shader_variant(shader, key, debug);
	}
	return shader;
}

/* a bit annoying that compute-shader and normal shader state objects
 * aren't a bit more aligned.
 */
struct ir3_shader *
ir3_shader_create_compute(struct ir3_compiler *compiler,
		const struct pipe_compute_state *cso,
		struct pipe_debug_callback *debug)
{
	struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);

	shader->compiler = compiler;
	shader->id = ++shader->compiler->shader_count;
	shader->type = SHADER_COMPUTE;

	nir_shader *nir;
	if (cso->ir_type == PIPE_SHADER_IR_NIR) {
		/* we take ownership of the reference: */
		nir = (nir_shader *)cso->prog;

		NIR_PASS_V(nir, nir_lower_io, nir_var_all, ir3_glsl_type_size,
				(nir_lower_io_options)0);
	} else {
		debug_assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
		if (fd_mesa_debug & FD_DBG_DISASM) {
			DBG("dump tgsi: type=%d", shader->type);
			tgsi_dump(cso->prog, 0);
		}
		nir = ir3_tgsi_to_nir(cso->prog);
	}

	/* do first pass optimization, ignoring the key: */
	shader->nir = ir3_optimize_nir(shader, nir, NULL);
	if (fd_mesa_debug & FD_DBG_DISASM) {
		printf("dump nir%d: type=%d\n", shader->id, shader->type);
		nir_print_shader(shader->nir, stdout);
	}

	return shader;
}

static void dump_reg(FILE *out, const char *name, uint32_t r)
{
	/* regid(63,0) is the "not assigned/not written" sentinel, so only
	 * dump regs that are actually in use:
	 */
	if (r != regid(63,0))
		fprintf(out, "; %s: r%d.%c\n", name, r >> 2, "xyzw"[r & 0x3]);
}

static void dump_output(FILE *out, struct ir3_shader_variant *so,
		unsigned slot, const char *name)
{
	uint32_t regid;
	regid = ir3_find_output_regid(so, slot);
	dump_reg(out, name, regid);
}

void
ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin, FILE *out)
{
	struct ir3 *ir = so->ir;
	struct ir3_register *reg;
	const char *type = ir3_shader_stage(so->shader);
	uint8_t regid;
	unsigned i;

	for (i = 0; i < ir->ninputs; i++) {
		if (!ir->inputs[i]) {
			fprintf(out, "; in%d unused\n", i);
			continue;
		}
		reg = ir->inputs[i]->regs[0];
		regid = reg->num;
		fprintf(out, "@in(%sr%d.%c)\tin%d\n",
				(reg->flags & IR3_REG_HALF) ? "h" : "",
				(regid >> 2), "xyzw"[regid & 0x3], i);
	}

	for (i = 0; i < ir->noutputs; i++) {
		if (!ir->outputs[i]) {
			fprintf(out, "; out%d unused\n", i);
			continue;
		}
		/* kill shows up as a virtual output.. skip it! */
		if (is_kill(ir->outputs[i]))
			continue;
		reg = ir->outputs[i]->regs[0];
		regid = reg->num;
		fprintf(out, "@out(%sr%d.%c)\tout%d\n",
				(reg->flags & IR3_REG_HALF) ? "h" : "",
				(regid >> 2), "xyzw"[regid & 0x3], i);
	}

	for (i = 0; i < so->immediates_count; i++) {
		fprintf(out, "@const(c%d.x)\t", so->constbase.immediate + i);
		fprintf(out, "0x%08x, 0x%08x, 0x%08x, 0x%08x\n",
				so->immediates[i].val[0],
				so->immediates[i].val[1],
				so->immediates[i].val[2],
				so->immediates[i].val[3]);
	}

	disasm_a3xx(bin, so->info.sizedwords, 0, out);

	switch (so->type) {
	case SHADER_VERTEX:
		fprintf(out, "; %s: outputs:", type);
		for (i = 0; i < so->outputs_count; i++) {
			uint8_t regid = so->outputs[i].regid;
			fprintf(out, " r%d.%c (%s)",
					(regid >> 2), "xyzw"[regid & 0x3],
					gl_varying_slot_name(so->outputs[i].slot));
		}
		fprintf(out, "\n");
		fprintf(out, "; %s: inputs:", type);
		for (i = 0; i < so->inputs_count; i++) {
			uint8_t regid = so->inputs[i].regid;
			fprintf(out, " r%d.%c (cm=%x,il=%u,b=%u)",
					(regid >> 2), "xyzw"[regid & 0x3],
					so->inputs[i].compmask,
					so->inputs[i].inloc,
					so->inputs[i].bary);
		}
		fprintf(out, "\n");
		break;
	case SHADER_FRAGMENT:
		fprintf(out, "; %s: outputs:", type);
		for (i = 0; i < so->outputs_count; i++) {
			uint8_t regid = so->outputs[i].regid;
			fprintf(out, " r%d.%c (%s)",
					(regid >> 2), "xyzw"[regid & 0x3],
					gl_frag_result_name(so->outputs[i].slot));
		}
		fprintf(out, "\n");
		fprintf(out, "; %s: inputs:", type);
		for (i = 0; i < so->inputs_count; i++) {
			uint8_t regid = so->inputs[i].regid;
			fprintf(out, " r%d.%c (%s,cm=%x,il=%u,b=%u)",
					(regid >> 2), "xyzw"[regid & 0x3],
					gl_varying_slot_name(so->inputs[i].slot),
					so->inputs[i].compmask,
					so->inputs[i].inloc,
					so->inputs[i].bary);
		}
		fprintf(out, "\n");
		break;
	default:
		/* TODO */
		break;
	}

	/* print generic shader info: */
	fprintf(out, "; %s prog %d/%d: %u instructions, %d half, %d full\n",
			type, so->shader->id, so->id,
			so->info.instrs_count,
			so->info.max_half_reg + 1,
			so->info.max_reg + 1);

	fprintf(out, "; %d const, %u constlen\n",
			so->info.max_const + 1,
			so->constlen);

	fprintf(out, "; %u (ss), %u (sy)\n", so->info.ss, so->info.sy);

	/* print shader type specific info: */
	switch (so->type) {
	case SHADER_VERTEX:
		dump_output(out, so, VARYING_SLOT_POS, "pos");
		dump_output(out, so, VARYING_SLOT_PSIZ, "psize");
		break;
	case SHADER_FRAGMENT:
		dump_reg(out, "pos (bary)",
			ir3_find_sysval_regid(so, SYSTEM_VALUE_VARYING_COORD));
		dump_output(out, so, FRAG_RESULT_DEPTH, "posz");
		if (so->color0_mrt) {
			dump_output(out, so, FRAG_RESULT_COLOR, "color");
		} else {
			dump_output(out, so, FRAG_RESULT_DATA0, "data0");
			dump_output(out, so, FRAG_RESULT_DATA1, "data1");
			dump_output(out, so, FRAG_RESULT_DATA2, "data2");
			dump_output(out, so, FRAG_RESULT_DATA3, "data3");
			dump_output(out, so, FRAG_RESULT_DATA4, "data4");
			dump_output(out, so, FRAG_RESULT_DATA5, "data5");
			dump_output(out, so, FRAG_RESULT_DATA6, "data6");
			dump_output(out, so, FRAG_RESULT_DATA7, "data7");
		}
		/* these two are hard-coded since we don't know how to
		 * program them to anything but all 0's...
		 */
		if (so->frag_coord)
			fprintf(out, "; fragcoord: r0.x\n");
		if (so->frag_face)
			fprintf(out, "; fragface: hr0.x\n");
		break;
	default:
		/* TODO */
		break;
	}

	fprintf(out, "\n");
}

uint64_t
ir3_shader_outputs(const struct ir3_shader *so)
{
	return so->nir->info.outputs_written;
}

/* This has to reach into the fd_context a bit more than the rest of
 * ir3, but it needs to be aligned with the compiler, so both agree
 * on which const regs hold what.  And the logic is identical between
 * a3xx/a4xx, the only difference is small details in the actual
 * CP_LOAD_STATE packets (which is handled inside the generation
 * specific ctx->emit_const(_bo)() fxns)
 */

#include "freedreno_resource.h"

static void
emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
		struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf)
{
	const unsigned index = 0;     /* user consts are index 0 */

	if (constbuf->enabled_mask & (1 << index)) {
		struct pipe_constant_buffer *cb = &constbuf->cb[index];
		unsigned size = align(cb->buffer_size, 4) / 4; /* size in dwords */

		/* in particular, with binning shader we may end up with
		 * unused consts, ie. we could end up w/ constlen that is
		 * smaller than first_driver_param.  In that case truncate
		 * the user consts early to avoid HLSQ lockup caused by
		 * writing too many consts
		 */
		uint32_t max_const = MIN2(v->num_uniforms, v->constlen);

		/* I expect that size should be a multiple of vec4's: */
		assert(size == align(size, 4));

		/* and even if the start of the const buffer is before
		 * first_immediate, the end may not be:
		 */
		size = MIN2(size, 4 * max_const);

		if (size > 0) {
			fd_wfi(ctx->batch, ring);
			ctx->emit_const(ring, v->type, 0,
					cb->buffer_offset, size,
					cb->user_buffer, cb->buffer);
		}
	}
}
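
/* Worked example of the clamping above (illustration only): a 96-byte
 * user constant buffer is align(96,4)/4 == 24 dwords == 6 vec4's.  If the
 * variant only uses 4 uniform vec4's (num_uniforms == 4, constlen == 8),
 * the upload is clamped to 4 * 4 == 16 dwords so the hw never sees consts
 * past what the shader declares:
 */
#if 0
	unsigned size = align(96, 4) / 4;     /* 24 dwords */
	uint32_t max_const = MIN2(4, 8);      /* num_uniforms == 4, constlen == 8 */
	size = MIN2(size, 4 * max_const);     /* clamped to 16 dwords */
	assert(size == 16);
#endif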

static void
emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
		struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf)
{
	uint32_t offset = v->constbase.ubo;
	if (v->constlen > offset) {
		uint32_t params = v->num_ubos;
		uint32_t offsets[params];
		struct pipe_resource *prscs[params];

		for (uint32_t i = 0; i < params; i++) {
			const uint32_t index = i + 1; /* UBOs start at index 1 */
			struct pipe_constant_buffer *cb = &constbuf->cb[index];
			assert(!cb->user_buffer);

			if ((constbuf->enabled_mask & (1 << index)) && cb->buffer) {
				offsets[i] = cb->buffer_offset;
				prscs[i] = cb->buffer;
			} else {
				offsets[i] = 0;
				prscs[i] = NULL;
			}
		}

		fd_wfi(ctx->batch, ring);
		ctx->emit_const_bo(ring, v->type, false, offset * 4, params, prscs, offsets);
	}
}
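
/* Illustration of the index shift above (not driver code): gallium slot 0
 * is the user const buffer handled by emit_user_consts(), so UBO pointer
 * i in the const file comes from gallium constbuf slot i+1:
 */
#if 0
	uint32_t params = 2;                  /* v->num_ubos */
	for (uint32_t i = 0; i < params; i++) {
		uint32_t index = i + 1;       /* prscs[i] <- constbuf->cb[index].buffer */
		(void)index;                  /* pointers land at const offset v->constbase.ubo * 4 */
	}
#endif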

static void
emit_ssbo_sizes(struct fd_context *ctx, const struct ir3_shader_variant *v,
		struct fd_ringbuffer *ring, struct fd_shaderbuf_stateobj *sb)
{
	uint32_t offset = v->constbase.ssbo_sizes;
	if (v->constlen > offset) {
		uint32_t sizes[align(v->const_layout.ssbo_size.count, 4)];
		unsigned mask = v->const_layout.ssbo_size.mask;

		while (mask) {
			unsigned index = u_bit_scan(&mask);
			unsigned off = v->const_layout.ssbo_size.off[index];
			sizes[off] = sb->sb[index].buffer_size;
		}

		fd_wfi(ctx->batch, ring);
		ctx->emit_const(ring, v->type, offset * 4,
			0, ARRAY_SIZE(sizes), sizes, NULL);
	}
}
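
/* A small sketch of the mask walk above (illustration only): u_bit_scan()
 * pops the lowest set bit and returns its index, so a mask of 0x5
 * (0b101) visits SSBO slots 0 and 2:
 */
#if 0
	unsigned mask = 0x5;
	assert(u_bit_scan(&mask) == 0);   /* mask is now 0x4 */
	assert(u_bit_scan(&mask) == 2);   /* mask is now 0x0 */
#endif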

static void
emit_image_dims(struct fd_context *ctx, const struct ir3_shader_variant *v,
		struct fd_ringbuffer *ring, struct fd_shaderimg_stateobj *si)
{
	uint32_t offset = v->constbase.image_dims;
	if (v->constlen > offset) {
		uint32_t dims[align(v->const_layout.image_dims.count, 4)];
		unsigned mask = v->const_layout.image_dims.mask;

		while (mask) {
			struct pipe_image_view *img;
			struct fd_resource *rsc;
			unsigned index = u_bit_scan(&mask);
			unsigned off = v->const_layout.image_dims.off[index];

			img = &si->si[index];
			rsc = fd_resource(img->resource);

			dims[off + 0] = util_format_get_blocksize(img->format);
			if (img->resource->target != PIPE_BUFFER) {
				unsigned lvl = img->u.tex.level;
				/* note for 2d/cube/etc images, even if re-interpreted
				 * as a different color format, the pixel size should
				 * be the same, so use original dimensions for y and z
				 * stride:
				 */
				dims[off + 1] = rsc->slices[lvl].pitch * rsc->cpp;
				/* see corresponding logic in fd_resource_offset(): */
				if (rsc->layer_first) {
					dims[off + 2] = rsc->layer_size;
				} else {
					dims[off + 2] = rsc->slices[lvl].size0;
				}
			}
		}

		fd_wfi(ctx->batch, ring);
		ctx->emit_const(ring, v->type, offset * 4,
			0, ARRAY_SIZE(dims), dims, NULL);
	}
}
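
/* Worked example (illustration only, assuming a linear 2d RGBA8 image
 * with a 256-pixel pitch): each image gets a {bpp, y-stride, z-stride}
 * triple in the const file, so the shader's image-store address math is
 * roughly x*bpp + y*ystride + z*zstride:
 */
#if 0
	uint32_t bpp = 4;                       /* util_format_get_blocksize() of RGBA8 */
	uint32_t ystride = 256 * bpp;           /* slices[lvl].pitch * cpp, in bytes */
	assert(bpp * 1 + ystride * 2 == 2052);  /* byte offset of texel (1,2) */
#endif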

static void
emit_immediates(struct fd_context *ctx, const struct ir3_shader_variant *v,
		struct fd_ringbuffer *ring)
{
	int size = v->immediates_count;
	uint32_t base = v->constbase.immediate;

	/* truncate size to avoid writing constants that shader
	 * does not use:
	 */
	size = MIN2(size + base, v->constlen) - base;

	/* convert out of vec4: */
	base *= 4;
	size *= 4;

	if (size > 0) {
		fd_wfi(ctx->batch, ring);
		ctx->emit_const(ring, v->type, base,
			0, size, v->immediates[0].val, NULL);
	}
}
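
/* Worked example of the truncation above (illustration only): with 4
 * immediate vec4's starting at const base 8 but a constlen of only 10
 * vec4's, just 10 - 8 == 2 vec4's (8 dwords at dword offset 32) get
 * emitted:
 */
#if 0
	int size = 4;                          /* v->immediates_count */
	uint32_t base = 8;                     /* v->constbase.immediate */
	size = MIN2(size + base, 10) - base;   /* v->constlen == 10 -> size 2 */
	assert(size == 2 && base * 4 == 32);
#endif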

/* emit stream-out buffers: */
static void
emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v,
		struct fd_ringbuffer *ring)
{
	/* streamout addresses after driver-params: */
	uint32_t offset = v->constbase.tfbo;
	if (v->constlen > offset) {
		struct fd_streamout_stateobj *so = &ctx->streamout;
		struct pipe_stream_output_info *info = &v->shader->stream_output;
		uint32_t params = 4;
		uint32_t offsets[params];
		struct pipe_resource *prscs[params];

		for (uint32_t i = 0; i < params; i++) {
			struct pipe_stream_output_target *target = so->targets[i];

			if (target) {
				offsets[i] = (so->offsets[i] * info->stride[i] * 4) +
						target->buffer_offset;
				prscs[i] = target->buffer;
			} else {
				offsets[i] = 0;
				prscs[i] = NULL;
			}
		}

		fd_wfi(ctx->batch, ring);
		ctx->emit_const_bo(ring, v->type, true, offset * 4, params, prscs, offsets);
	}
}
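
/* Worked example of the offset math above (illustration only): with an
 * already-emitted vertex count so->offsets[i] == 100, a stride of 4
 * dwords (16 bytes) and a target buffer_offset of 64 bytes, the shader's
 * base pointer for buffer i lands at:
 */
#if 0
	uint32_t byte_off = (100 * 4 * 4) + 64;   /* vertices * stride-dwords * 4 + offset */
	assert(byte_off == 1664);
#endif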

static uint32_t
max_tf_vtx(struct fd_context *ctx, const struct ir3_shader_variant *v)
{
	struct fd_streamout_stateobj *so = &ctx->streamout;
	struct pipe_stream_output_info *info = &v->shader->stream_output;
	uint32_t maxvtxcnt = 0x7fffffff;

	if (ctx->screen->gpu_id >= 500)
		return 0;
	if (v->key.binning_pass)
		return 0;
	if (v->shader->stream_output.num_outputs == 0)
		return 0;
	if (so->num_targets == 0)
		return 0;

	/* offset to write to is:
	 *
	 *   total_vtxcnt = vtxcnt + offsets[i]
	 *   offset = total_vtxcnt * stride[i]
	 *
	 *   offset =   vtxcnt * stride[i]       ; calculated in shader
	 *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
	 *
	 * assuming for each vtx, each target buffer will have data written
	 * up to 'offset + stride[i]', that leaves maxvtxcnt as:
	 *
	 *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
	 *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
	 *
	 * but the shader is actually doing a less-than (rather than
	 * less-than-equal) check, so we can drop the -stride[i].
	 *
	 * TODO is the assumption about `offset + stride[i]` legit?
	 */
	for (unsigned i = 0; i < so->num_targets; i++) {
		struct pipe_stream_output_target *target = so->targets[i];
		unsigned stride = info->stride[i] * 4;   /* convert dwords->bytes */
		if (target) {
			uint32_t max = target->buffer_size / stride;
			maxvtxcnt = MIN2(maxvtxcnt, max);
		}
	}

	return maxvtxcnt;
}
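
/* Worked example (illustration only): a 1024-byte target with a 4-dword
 * (16-byte) stride can hold 1024 / 16 == 64 vertices, so adding a second
 * 256-byte target at the same stride tightens the clamp to 16:
 */
#if 0
	uint32_t maxvtxcnt = 0x7fffffff;
	maxvtxcnt = MIN2(maxvtxcnt, 1024 / 16);   /* 64 */
	maxvtxcnt = MIN2(maxvtxcnt, 256 / 16);    /* 16 */
	assert(maxvtxcnt == 16);
#endif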

static void
emit_common_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
		struct fd_context *ctx, enum pipe_shader_type t)
{
	enum fd_dirty_shader_state dirty = ctx->dirty_shader[t];

	if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) {
		struct fd_constbuf_stateobj *constbuf;
		bool shader_dirty;

		constbuf = &ctx->constbuf[t];
		shader_dirty = !!(dirty & FD_DIRTY_SHADER_PROG);

		emit_user_consts(ctx, v, ring, constbuf);
		emit_ubos(ctx, v, ring, constbuf);
		if (shader_dirty)
			emit_immediates(ctx, v, ring);
	}

	if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_SSBO)) {
		struct fd_shaderbuf_stateobj *sb = &ctx->shaderbuf[t];
		emit_ssbo_sizes(ctx, v, ring, sb);
	}

	if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_IMAGE)) {
		struct fd_shaderimg_stateobj *si = &ctx->shaderimg[t];
		emit_image_dims(ctx, v, ring, si);
	}
}

void
ir3_emit_vs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
		struct fd_context *ctx, const struct pipe_draw_info *info)
{
	debug_assert(v->type == SHADER_VERTEX);

	emit_common_consts(v, ring, ctx, PIPE_SHADER_VERTEX);

	/* emit driver params every time: */
	/* TODO skip emit if shader doesn't use driver params to avoid WFI.. */
	if (info) {
		uint32_t offset = v->constbase.driver_param;
		if (v->constlen > offset) {
			uint32_t vertex_params[IR3_DP_VS_COUNT] = {
				[IR3_DP_VTXID_BASE] = info->index_size ?
						info->index_bias : info->start,
				[IR3_DP_VTXCNT_MAX] = max_tf_vtx(ctx, v),
			};
			/* if no user-clip-planes, we don't need to emit the
			 * entire thing:
			 */
			uint32_t vertex_params_size = 4;

			if (v->key.ucp_enables) {
				struct pipe_clip_state *ucp = &ctx->ucp;
				unsigned pos = IR3_DP_UCP0_X;
				for (unsigned i = 0; pos <= IR3_DP_UCP7_W; i++) {
					for (unsigned j = 0; j < 4; j++) {
						vertex_params[pos] = fui(ucp->ucp[i][j]);
						pos++;
					}
				}
				vertex_params_size = ARRAY_SIZE(vertex_params);
			}

			fd_wfi(ctx->batch, ring);

			bool needs_vtxid_base =
				ir3_find_sysval_regid(v, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) != regid(63, 0);

			/* for indirect draw, we need to copy VTXID_BASE from
			 * indirect-draw parameters buffer.. which is annoying
			 * and means we can't easily emit these consts in cmd
			 * stream so need to copy them to bo.
			 */
			if (info->indirect && needs_vtxid_base) {
				struct pipe_draw_indirect_info *indirect = info->indirect;
				struct pipe_resource *vertex_params_rsc =
					pipe_buffer_create(&ctx->screen->base,
						PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM,
						vertex_params_size * 4);
				unsigned src_off = info->indirect->offset;
				void *ptr;

				ptr = fd_bo_map(fd_resource(vertex_params_rsc)->bo);
				memcpy(ptr, vertex_params, vertex_params_size * 4);

				if (info->index_size) {
					/* indexed draw, index_bias is 4th field: */
					src_off += 3 * 4;
				} else {
					/* non-indexed draw, start is 3rd field: */
					src_off += 2 * 4;
				}

				/* copy index_bias or start from draw params: */
				ctx->mem_to_mem(ring, vertex_params_rsc, 0,
						indirect->buffer, src_off, 1);

				ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0,
						vertex_params_size, NULL, vertex_params_rsc);

				pipe_resource_reference(&vertex_params_rsc, NULL);
			} else {
				ctx->emit_const(ring, SHADER_VERTEX, offset * 4, 0,
						vertex_params_size, vertex_params, NULL);
			}

			/* if needed, emit stream-out buffer addresses: */
			if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) {
				emit_tfbos(ctx, v, ring);
			}
		}
	}
}
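
/* The "3rd/4th field" offsets above mirror the GL indirect-draw command
 * layouts; a sketch for reference (struct names are illustrative, not
 * driver code):
 */
#if 0
struct draw_elements_indirect_command {
	uint32_t count;
	uint32_t instance_count;
	uint32_t first_index;
	int32_t  base_vertex;      /* index_bias: 4th field, byte offset 3*4 */
	uint32_t base_instance;
};
struct draw_arrays_indirect_command {
	uint32_t count;
	uint32_t instance_count;
	uint32_t first;            /* start: 3rd field, byte offset 2*4 */
	uint32_t base_instance;
};
#endif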

void
ir3_emit_fs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
		struct fd_context *ctx)
{
	debug_assert(v->type == SHADER_FRAGMENT);

	emit_common_consts(v, ring, ctx, PIPE_SHADER_FRAGMENT);
}

/* emit compute-shader consts: */
void
ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
		struct fd_context *ctx, const struct pipe_grid_info *info)
{
	debug_assert(v->type == SHADER_COMPUTE);

	emit_common_consts(v, ring, ctx, PIPE_SHADER_COMPUTE);

	/* emit compute-shader driver-params: */
	uint32_t offset = v->constbase.driver_param;
	if (v->constlen > offset) {
		fd_wfi(ctx->batch, ring);

		if (info->indirect) {
			struct pipe_resource *indirect = NULL;
			unsigned indirect_offset;

			/* This is a bit awkward, but CP_LOAD_STATE.EXT_SRC_ADDR needs
			 * to be aligned more strongly than 4 bytes.  So in this case
			 * we need a temporary buffer to copy NumWorkGroups.xyz to.
			 *
			 * TODO if previous compute job is writing to info->indirect,
			 * we might need a WFI.. but since we currently flush for each
			 * compute job, we are probably ok for now.
			 */
			if (info->indirect_offset & 0xf) {
				indirect = pipe_buffer_create(&ctx->screen->base,
					PIPE_BIND_COMMAND_ARGS_BUFFER, PIPE_USAGE_STREAM,
					0x1000);
				indirect_offset = 0;

				ctx->mem_to_mem(ring, indirect, 0, info->indirect,
						info->indirect_offset, 3);
			} else {
				pipe_resource_reference(&indirect, info->indirect);
				indirect_offset = info->indirect_offset;
			}

			ctx->emit_const(ring, SHADER_COMPUTE, offset * 4,
					indirect_offset, 4, NULL, indirect);

			pipe_resource_reference(&indirect, NULL);
		} else {
			uint32_t compute_params[IR3_DP_CS_COUNT] = {
				[IR3_DP_NUM_WORK_GROUPS_X] = info->grid[0],
				[IR3_DP_NUM_WORK_GROUPS_Y] = info->grid[1],
				[IR3_DP_NUM_WORK_GROUPS_Z] = info->grid[2],
				[IR3_DP_LOCAL_GROUP_SIZE_X] = info->block[0],
				[IR3_DP_LOCAL_GROUP_SIZE_Y] = info->block[1],
				[IR3_DP_LOCAL_GROUP_SIZE_Z] = info->block[2],
			};

			ctx->emit_const(ring, SHADER_COMPUTE, offset * 4, 0,
					ARRAY_SIZE(compute_params), compute_params, NULL);
		}
	}
}