panfrost: Add a bitset of render targets read by shaders
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_pool_upload(&batch->pool, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
 77         /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_pool_alloc(&batch->pool, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query) {
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
140 PAN_BO_ACCESS_SHARED |
141 PAN_BO_ACCESS_RW |
142 PAN_BO_ACCESS_FRAGMENT);
143 } else {
144 postfix->occlusion_counter = 0;
145 }
146 }
147
148 void
149 panfrost_vt_init(struct panfrost_context *ctx,
150 enum pipe_shader_type stage,
151 struct mali_vertex_tiler_prefix *prefix,
152 struct mali_vertex_tiler_postfix *postfix)
153 {
154 struct panfrost_device *device = pan_device(ctx->base.screen);
155
156 if (!ctx->shader[stage])
157 return;
158
159 memset(prefix, 0, sizeof(*prefix));
160 memset(postfix, 0, sizeof(*postfix));
161
162 if (device->quirks & IS_BIFROST) {
163 postfix->gl_enables = 0x2;
164 panfrost_vt_emit_shared_memory(ctx, postfix);
165 } else {
166 postfix->gl_enables = 0x6;
167 panfrost_vt_attach_framebuffer(ctx, postfix);
168 }
169
170 if (stage == PIPE_SHADER_FRAGMENT) {
171 panfrost_vt_update_occlusion_query(ctx, postfix);
172 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
173 }
174 }
175
176 static unsigned
177 panfrost_translate_index_size(unsigned size)
178 {
179 switch (size) {
180 case 1:
181 return MALI_DRAW_INDEXED_UINT8;
182
183 case 2:
184 return MALI_DRAW_INDEXED_UINT16;
185
186 case 4:
187 return MALI_DRAW_INDEXED_UINT32;
188
189 default:
190 unreachable("Invalid index size");
191 }
192 }
193
 194 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
 195  * good for the duration of the draw (transient), though it could last longer.
 196  * Also gets the bounds on the index buffer for the range accessed by the draw.
 197  * We do these operations together because there are natural optimizations which
 198  * require them to be together. */
199
200 static mali_ptr
201 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
202 const struct pipe_draw_info *info,
203 unsigned *min_index, unsigned *max_index)
204 {
205 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
206 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
207 off_t offset = info->start * info->index_size;
208 bool needs_indices = true;
209 mali_ptr out = 0;
210
211 if (info->max_index != ~0u) {
212 *min_index = info->min_index;
213 *max_index = info->max_index;
214 needs_indices = false;
215 }
216
217 if (!info->has_user_indices) {
218 /* Only resources can be directly mapped */
219 panfrost_batch_add_bo(batch, rsrc->bo,
220 PAN_BO_ACCESS_SHARED |
221 PAN_BO_ACCESS_READ |
222 PAN_BO_ACCESS_VERTEX_TILER);
223 out = rsrc->bo->gpu + offset;
224
225 /* Check the cache */
226 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
227 info->start,
228 info->count,
229 min_index,
230 max_index);
231 } else {
232 /* Otherwise, we need to upload to transient memory */
233 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
234 out = panfrost_pool_upload(&batch->pool, ibuf8 + offset,
235 info->count *
236 info->index_size);
237 }
238
239 if (needs_indices) {
240 /* Fallback */
241 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
242
243 if (!info->has_user_indices)
244 panfrost_minmax_cache_add(rsrc->index_cache,
245 info->start, info->count,
246 *min_index, *max_index);
247 }
248
249 return out;
250 }
251
252 void
253 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
254 const struct pipe_draw_info *info,
255 enum mali_draw_mode draw_mode,
256 struct mali_vertex_tiler_postfix *vertex_postfix,
257 struct mali_vertex_tiler_prefix *tiler_prefix,
258 struct mali_vertex_tiler_postfix *tiler_postfix,
259 unsigned *vertex_count,
260 unsigned *padded_count)
261 {
262 tiler_prefix->draw_mode = draw_mode;
263
264 unsigned draw_flags = 0;
265
266 if (panfrost_writes_point_size(ctx))
267 draw_flags |= MALI_DRAW_VARYING_SIZE;
268
269 if (info->primitive_restart)
270 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
271
 272         /* These don't make much sense */
273
274 draw_flags |= 0x3000;
275
276 if (info->index_size) {
277 unsigned min_index = 0, max_index = 0;
278
279 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
280 info,
281 &min_index,
282 &max_index);
283
284 /* Use the corresponding values */
285 *vertex_count = max_index - min_index + 1;
286 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
287 tiler_prefix->offset_bias_correction = -min_index;
288 tiler_prefix->index_count = MALI_POSITIVE(info->count);
289 draw_flags |= panfrost_translate_index_size(info->index_size);
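                /* Worked example (illustrative values, not from this commit):
                 * if the referenced indices span [min_index, max_index] =
                 * [100, 149], then 149 - 100 + 1 = 50 vertices are shaded,
                 * attribute fetch begins at 100 + index_bias, and
                 * offset_bias_correction = -100 keeps the fetched indices
                 * lined up with those attributes. */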
290 } else {
291 tiler_prefix->indices = 0;
292 *vertex_count = ctx->vertex_count;
293 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
294 tiler_prefix->offset_bias_correction = 0;
295 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
296 }
297
298 tiler_prefix->unknown_draw = draw_flags;
299
300 /* Encode the padded vertex count */
301
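        /* The instance_shift/instance_odd fields below encode the padded count
         * as (2 * instance_odd + 1) << instance_shift. Sketch with illustrative
         * numbers (not from this commit): a padded count of 24 gives
         * shift = ctz(24) = 3 and odd = 24 >> 4 = 1, since 24 = (2 * 1 + 1) << 3. */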
302 if (info->instance_count > 1) {
303 *padded_count = panfrost_padded_vertex_count(*vertex_count);
304
305 unsigned shift = __builtin_ctz(ctx->padded_count);
306 unsigned k = ctx->padded_count >> (shift + 1);
307
308 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
309 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
310 } else {
311 *padded_count = *vertex_count;
312
313 /* Reset instancing state */
314 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
315 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
316 }
317 }
318
319 static void
320 panfrost_shader_meta_init(struct panfrost_context *ctx,
321 enum pipe_shader_type st,
322 struct mali_shader_meta *meta)
323 {
324 const struct panfrost_device *dev = pan_device(ctx->base.screen);
325 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
326
327 memset(meta, 0, sizeof(*meta));
328 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
329 meta->attribute_count = ss->attribute_count;
330 meta->varying_count = ss->varying_count;
331 meta->texture_count = ctx->sampler_view_count[st];
332 meta->sampler_count = ctx->sampler_count[st];
333
334 if (dev->quirks & IS_BIFROST) {
335 if (st == PIPE_SHADER_VERTEX)
336 meta->bifrost1.unk1 = 0x800000;
337 else {
338 /* First clause ATEST |= 0x4000000.
339 * Less than 32 regs |= 0x200 */
340 meta->bifrost1.unk1 = 0x950020;
341 }
342
343 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
344 if (st == PIPE_SHADER_VERTEX)
345 meta->bifrost2.preload_regs = 0xC0;
346 else {
347 meta->bifrost2.preload_regs = 0x1;
348 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
349 }
350
351 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
352 ss->uniform_cutoff);
353 } else {
354 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
355 ss->uniform_cutoff);
356 meta->midgard1.work_count = ss->work_reg_count;
357
358 /* TODO: This is not conformant on ES3 */
359 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
360
361 meta->midgard1.flags_lo = 0x20;
362 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
363
364 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
365 }
366 }
367
368 static unsigned
369 panfrost_translate_compare_func(enum pipe_compare_func in)
370 {
371 switch (in) {
372 case PIPE_FUNC_NEVER:
373 return MALI_FUNC_NEVER;
374
375 case PIPE_FUNC_LESS:
376 return MALI_FUNC_LESS;
377
378 case PIPE_FUNC_EQUAL:
379 return MALI_FUNC_EQUAL;
380
381 case PIPE_FUNC_LEQUAL:
382 return MALI_FUNC_LEQUAL;
383
384 case PIPE_FUNC_GREATER:
385 return MALI_FUNC_GREATER;
386
387 case PIPE_FUNC_NOTEQUAL:
388 return MALI_FUNC_NOTEQUAL;
389
390 case PIPE_FUNC_GEQUAL:
391 return MALI_FUNC_GEQUAL;
392
393 case PIPE_FUNC_ALWAYS:
394 return MALI_FUNC_ALWAYS;
395
396 default:
397 unreachable("Invalid func");
398 }
399 }
400
401 static unsigned
402 panfrost_translate_stencil_op(enum pipe_stencil_op in)
403 {
404 switch (in) {
405 case PIPE_STENCIL_OP_KEEP:
406 return MALI_STENCIL_KEEP;
407
408 case PIPE_STENCIL_OP_ZERO:
409 return MALI_STENCIL_ZERO;
410
411 case PIPE_STENCIL_OP_REPLACE:
412 return MALI_STENCIL_REPLACE;
413
414 case PIPE_STENCIL_OP_INCR:
415 return MALI_STENCIL_INCR;
416
417 case PIPE_STENCIL_OP_DECR:
418 return MALI_STENCIL_DECR;
419
420 case PIPE_STENCIL_OP_INCR_WRAP:
421 return MALI_STENCIL_INCR_WRAP;
422
423 case PIPE_STENCIL_OP_DECR_WRAP:
424 return MALI_STENCIL_DECR_WRAP;
425
426 case PIPE_STENCIL_OP_INVERT:
427 return MALI_STENCIL_INVERT;
428
429 default:
430 unreachable("Invalid stencil op");
431 }
432 }
433
434 static unsigned
435 translate_tex_wrap(enum pipe_tex_wrap w)
436 {
437 switch (w) {
438 case PIPE_TEX_WRAP_REPEAT:
439 return MALI_WRAP_REPEAT;
440
441 case PIPE_TEX_WRAP_CLAMP:
442 return MALI_WRAP_CLAMP;
443
444 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
445 return MALI_WRAP_CLAMP_TO_EDGE;
446
447 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
448 return MALI_WRAP_CLAMP_TO_BORDER;
449
450 case PIPE_TEX_WRAP_MIRROR_REPEAT:
451 return MALI_WRAP_MIRRORED_REPEAT;
452
453 case PIPE_TEX_WRAP_MIRROR_CLAMP:
454 return MALI_WRAP_MIRRORED_CLAMP;
455
456 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
457 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
458
459 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
460 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
461
462 default:
463 unreachable("Invalid wrap");
464 }
465 }
466
467 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
468 struct mali_sampler_descriptor *hw)
469 {
470 unsigned func = panfrost_translate_compare_func(cso->compare_func);
471 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
472 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
473 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
474 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
475 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
476 unsigned mip_filter = mip_linear ?
477 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
478 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
479
480 *hw = (struct mali_sampler_descriptor) {
481 .filter_mode = min_filter | mag_filter | mip_filter |
482 normalized,
483 .wrap_s = translate_tex_wrap(cso->wrap_s),
484 .wrap_t = translate_tex_wrap(cso->wrap_t),
485 .wrap_r = translate_tex_wrap(cso->wrap_r),
486 .compare_func = panfrost_flip_compare_func(func),
487 .border_color = {
488 cso->border_color.f[0],
489 cso->border_color.f[1],
490 cso->border_color.f[2],
491 cso->border_color.f[3]
492 },
493 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
494 .max_lod = FIXED_16(cso->max_lod, false),
495 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
496 .seamless_cube_map = cso->seamless_cube_map,
497 };
498
499 /* If necessary, we disable mipmapping in the sampler descriptor by
500 * clamping the LOD as tight as possible (from 0 to epsilon,
501 * essentially -- remember these are fixed point numbers, so
502 * epsilon=1/256) */
503
504 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
505 hw->max_lod = hw->min_lod + 1;
506 }
507
508 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
509 struct bifrost_sampler_descriptor *hw)
510 {
511 *hw = (struct bifrost_sampler_descriptor) {
512 .unk1 = 0x1,
513 .wrap_s = translate_tex_wrap(cso->wrap_s),
514 .wrap_t = translate_tex_wrap(cso->wrap_t),
515 .wrap_r = translate_tex_wrap(cso->wrap_r),
516 .unk8 = 0x8,
517 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
518 .norm_coords = cso->normalized_coords,
519 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
520 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
521 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
522 .max_lod = FIXED_16(cso->max_lod, false),
523 };
524
525 /* If necessary, we disable mipmapping in the sampler descriptor by
526 * clamping the LOD as tight as possible (from 0 to epsilon,
527 * essentially -- remember these are fixed point numbers, so
528 * epsilon=1/256) */
529
530 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
531 hw->max_lod = hw->min_lod + 1;
532 }
533
534 static void
535 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
536 struct mali_stencil_test *out)
537 {
538 out->ref = 0; /* Gallium gets it from elsewhere */
539
540 out->mask = in->valuemask;
541 out->func = panfrost_translate_compare_func(in->func);
542 out->sfail = panfrost_translate_stencil_op(in->fail_op);
543 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
544 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
545 }
546
547 static void
548 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
549 struct mali_shader_meta *fragmeta)
550 {
551 if (!ctx->rasterizer) {
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
553 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
554 fragmeta->depth_units = 0.0f;
555 fragmeta->depth_factor = 0.0f;
556 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
557 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
558 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, true);
559 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, true);
560 return;
561 }
562
563 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
564
565 bool msaa = rast->multisample;
566
567 /* TODO: Sample size */
568 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
569 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
570 fragmeta->depth_units = rast->offset_units * 2.0f;
571 fragmeta->depth_factor = rast->offset_scale;
572
 573         /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
574
575 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
576 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
577
578 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
579 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
580 }
581
582 static void
583 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
584 struct mali_shader_meta *fragmeta)
585 {
586 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
587 int zfunc = PIPE_FUNC_ALWAYS;
588
589 if (!zsa) {
590 struct pipe_stencil_state default_stencil = {
591 .enabled = 0,
592 .func = PIPE_FUNC_ALWAYS,
593 .fail_op = MALI_STENCIL_KEEP,
594 .zfail_op = MALI_STENCIL_KEEP,
595 .zpass_op = MALI_STENCIL_KEEP,
596 .writemask = 0xFF,
597 .valuemask = 0xFF
598 };
599
600 panfrost_make_stencil_state(&default_stencil,
601 &fragmeta->stencil_front);
602 fragmeta->stencil_mask_front = default_stencil.writemask;
603 fragmeta->stencil_back = fragmeta->stencil_front;
604 fragmeta->stencil_mask_back = default_stencil.writemask;
605 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
606 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
607 } else {
608 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
609 zsa->stencil[0].enabled);
610 panfrost_make_stencil_state(&zsa->stencil[0],
611 &fragmeta->stencil_front);
612 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
613 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
614
615 /* If back-stencil is not enabled, use the front values */
616
617 if (zsa->stencil[1].enabled) {
618 panfrost_make_stencil_state(&zsa->stencil[1],
619 &fragmeta->stencil_back);
620 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
621 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
622 } else {
623 fragmeta->stencil_back = fragmeta->stencil_front;
624 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
625 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
626 }
627
628 if (zsa->depth.enabled)
629 zfunc = zsa->depth.func;
630
631 /* Depth state (TODO: Refactor) */
632
633 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
634 zsa->depth.writemask);
635 }
636
637 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
638 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
639 }
640
641 static bool
642 panfrost_fs_required(
643 struct panfrost_shader_state *fs,
644 struct panfrost_blend_final *blend,
645 unsigned rt_count)
646 {
647 /* If we generally have side effects */
648 if (fs->fs_sidefx)
649 return true;
650
651 /* If colour is written we need to execute */
652 for (unsigned i = 0; i < rt_count; ++i) {
653 if (!blend[i].no_colour)
654 return true;
655 }
656
657 /* If depth is written and not implied we need to execute.
658 * TODO: Predicate on Z/S writes being enabled */
659 return (fs->writes_depth || fs->writes_stencil);
660 }
661
662 static void
663 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
664 struct mali_shader_meta *fragmeta,
665 void *rts)
666 {
667 const struct panfrost_device *dev = pan_device(ctx->base.screen);
668 struct panfrost_shader_state *fs;
669 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
670
671 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
672 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
673 !ctx->blend->base.dither);
674
675 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
676 ctx->blend->base.alpha_to_coverage);
677
678 /* Get blending setup */
679 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
680
681 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
682 unsigned shader_offset = 0;
683 struct panfrost_bo *shader_bo = NULL;
684
685 for (unsigned c = 0; c < rt_count; ++c)
686 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
687 &shader_offset);
688
689 /* Disable shader execution if we can */
690 if (dev->quirks & MIDGARD_SHADERLESS
691 && !panfrost_fs_required(fs, blend, rt_count)) {
692 fragmeta->shader = 0;
693 fragmeta->attribute_count = 0;
694 fragmeta->varying_count = 0;
695 fragmeta->texture_count = 0;
696 fragmeta->sampler_count = 0;
697
698 /* This feature is not known to work on Bifrost */
699 fragmeta->midgard1.work_count = 1;
700 fragmeta->midgard1.uniform_count = 0;
701 fragmeta->midgard1.uniform_buffer_count = 0;
702 }
703
704 /* If there is a blend shader, work registers are shared. We impose 8
705 * work registers as a limit for blend shaders. Should be lower XXX */
706
707 if (!(dev->quirks & IS_BIFROST)) {
708 for (unsigned c = 0; c < rt_count; ++c) {
709 if (blend[c].is_shader) {
710 fragmeta->midgard1.work_count =
711 MAX2(fragmeta->midgard1.work_count, 8);
712 }
713 }
714 }
715
716 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
717 * copied to the blend_meta appended (by convention), but this is the
718 * field actually read by the hardware. (Or maybe both are read...?).
719 * Specify the last RTi with a blend shader. */
720
721 fragmeta->blend.shader = 0;
722
723 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
724 if (!blend[rt].is_shader)
725 continue;
726
727 fragmeta->blend.shader = blend[rt].shader.gpu |
728 blend[rt].shader.first_tag;
729 break;
730 }
731
732 if (dev->quirks & MIDGARD_SFBD) {
 733                 /* On platforms with only a single render target (SFBD), the blend
734 * information is inside the shader meta itself. We additionally
735 * need to signal CAN_DISCARD for nontrivial blend modes (so
736 * we're able to read back the destination buffer) */
737
738 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
739 blend[0].is_shader);
740
741 if (!blend[0].is_shader) {
742 fragmeta->blend.equation = *blend[0].equation.equation;
743 fragmeta->blend.constant = blend[0].equation.constant;
744 }
745
746 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
747 !blend[0].no_blending || fs->can_discard);
748 return;
749 }
750
751 if (dev->quirks & IS_BIFROST) {
752 bool no_blend = true;
753
754 for (unsigned i = 0; i < rt_count; ++i)
755 no_blend &= (blend[i].no_blending | blend[i].no_colour);
756
757 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
758 !fs->can_discard && !fs->writes_depth && no_blend);
759 }
760
761 /* Additional blend descriptor tacked on for jobs using MFBD */
762
763 for (unsigned i = 0; i < rt_count; ++i) {
764 unsigned flags = 0;
765
766 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
767 flags = 0x200;
768
769 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
770 (ctx->pipe_framebuffer.cbufs[i]) &&
771 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
772
773 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
774 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
775 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
776 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
777 }
778
779 if (dev->quirks & IS_BIFROST) {
780 struct bifrost_blend_rt *brts = rts;
781
782 brts[i].flags = flags;
783
784 if (blend[i].is_shader) {
785 /* The blend shader's address needs to be at
 786                                  * the same top 32 bits as the fragment shader.
787 * TODO: Ensure that's always the case.
788 */
789 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
790 (fs->bo->gpu & (0xffffffffull << 32)));
791 brts[i].shader = blend[i].shader.gpu;
792 brts[i].unk2 = 0x0;
793 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
794 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
795 const struct util_format_description *format_desc;
796 format_desc = util_format_description(format);
797
798 brts[i].equation = *blend[i].equation.equation;
799
800 /* TODO: this is a bit more complicated */
801 brts[i].constant = blend[i].equation.constant;
802
803 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
804
805 /* 0x19 disables blending and forces REPLACE
806 * mode (equivalent to rgb_mode = alpha_mode =
807 * x122, colour mask = 0xF). 0x1a allows
808 * blending. */
809 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
810
811 brts[i].shader_type = fs->blend_types[i];
812 } else {
813 /* Dummy attachment for depth-only */
814 brts[i].unk2 = 0x3;
815 brts[i].shader_type = fs->blend_types[i];
816 }
817 } else {
818 struct midgard_blend_rt *mrts = rts;
819 mrts[i].flags = flags;
820
821 if (blend[i].is_shader) {
822 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
823 } else {
824 mrts[i].blend.equation = *blend[i].equation.equation;
825 mrts[i].blend.constant = blend[i].equation.constant;
826 }
827 }
828 }
829 }
830
831 static void
832 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
833 struct mali_shader_meta *fragmeta,
834 void *rts)
835 {
836 const struct panfrost_device *dev = pan_device(ctx->base.screen);
837 struct panfrost_shader_state *fs;
838
839 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
840
841 bool msaa = ctx->rasterizer && ctx->rasterizer->base.multisample;
842 fragmeta->coverage_mask = (msaa ? ctx->sample_mask : ~0) & 0xF;
843
844 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
845 fragmeta->unknown2_4 = 0x4e0;
846
847 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
848 * is required (independent of 32-bit/64-bit descriptors), or why it's
849 * not used on later GPU revisions. Otherwise, all shader jobs fault on
850 * these earlier chips (perhaps this is a chicken bit of some kind).
851 * More investigation is needed. */
852
853 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
854
855 if (dev->quirks & IS_BIFROST) {
856 /* TODO */
857 } else {
 858                 /* Depending on whether it's legal to do so in the given shader, we
 859                  * try to enable early-z testing. TODO: respect e-z force */
860
861 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
862 !fs->can_discard && !fs->writes_global &&
863 !fs->writes_depth && !fs->writes_stencil &&
864 !ctx->blend->base.alpha_to_coverage);
865
866 /* Add the writes Z/S flags if needed. */
867 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
868 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
869
870 /* Any time texturing is used, derivatives are implicitly calculated,
871 * so we need to enable helper invocations */
872
873 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
874 fs->helper_invocations);
875
876 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
877
878 bool depth_enabled = fs->writes_depth ||
879 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
880
881 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
882 fs->outputs_read || (!depth_enabled && fs->can_discard));
883 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
884 }
885
886 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
887 panfrost_frag_meta_zsa_update(ctx, fragmeta);
888 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
889 }
890
891 void
892 panfrost_emit_shader_meta(struct panfrost_batch *batch,
893 enum pipe_shader_type st,
894 struct mali_vertex_tiler_postfix *postfix)
895 {
896 struct panfrost_context *ctx = batch->ctx;
897 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
898
899 if (!ss) {
900 postfix->shader = 0;
901 return;
902 }
903
904 struct mali_shader_meta meta;
905
906 panfrost_shader_meta_init(ctx, st, &meta);
907
908 /* Add the shader BO to the batch. */
909 panfrost_batch_add_bo(batch, ss->bo,
910 PAN_BO_ACCESS_PRIVATE |
911 PAN_BO_ACCESS_READ |
912 panfrost_bo_access_for_stage(st));
913
914 mali_ptr shader_ptr;
915
916 if (st == PIPE_SHADER_FRAGMENT) {
917 struct panfrost_device *dev = pan_device(ctx->base.screen);
918 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
919 size_t desc_size = sizeof(meta);
920 void *rts = NULL;
921 struct panfrost_transfer xfer;
922 unsigned rt_size;
923
924 if (dev->quirks & MIDGARD_SFBD)
925 rt_size = 0;
926 else if (dev->quirks & IS_BIFROST)
927 rt_size = sizeof(struct bifrost_blend_rt);
928 else
929 rt_size = sizeof(struct midgard_blend_rt);
930
931 desc_size += rt_size * rt_count;
932
933 if (rt_size)
934 rts = rzalloc_size(ctx, rt_size * rt_count);
935
936 panfrost_frag_shader_meta_init(ctx, &meta, rts);
937
938 xfer = panfrost_pool_alloc(&batch->pool, desc_size);
939
940 memcpy(xfer.cpu, &meta, sizeof(meta));
941 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
942
943 if (rt_size)
944 ralloc_free(rts);
945
946 shader_ptr = xfer.gpu;
947 } else {
948 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
949 sizeof(meta));
950 }
951
952 postfix->shader = shader_ptr;
953 }
954
955 static void
956 panfrost_mali_viewport_init(struct panfrost_context *ctx,
957 struct mali_viewport *mvp)
958 {
959 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
960
961 /* Clip bounds are encoded as floats. The viewport itself is encoded as
962 * (somewhat) asymmetric ints. */
963
964 const struct pipe_scissor_state *ss = &ctx->scissor;
965
966 memset(mvp, 0, sizeof(*mvp));
967
968 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
969 * each direction. Clipping to the viewport in theory should work, but
970 * in practice causes issues when we're not explicitly trying to
971 * scissor */
972
973 *mvp = (struct mali_viewport) {
974 .clip_minx = -INFINITY,
975 .clip_miny = -INFINITY,
976 .clip_maxx = INFINITY,
977 .clip_maxy = INFINITY,
978 };
979
980 /* Always scissor to the viewport by default. */
981 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
982 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
983
984 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
985 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
986
987 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
988 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
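        /* Illustrative example (values not from this commit): for a viewport
         * covering a full 800x600 framebuffer, gallium hands us
         * translate = (400, 300, 0.5) and scale = (400, -300, 0.5), so the
         * math above yields vp_minx = 0, vp_maxx = 800, vp_miny = 0,
         * vp_maxy = 600, minz = 0 and maxz = 1. */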
989
990 /* Apply the scissor test */
991
992 unsigned minx, miny, maxx, maxy;
993
994 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
995 minx = MAX2(ss->minx, vp_minx);
996 miny = MAX2(ss->miny, vp_miny);
997 maxx = MIN2(ss->maxx, vp_maxx);
998 maxy = MIN2(ss->maxy, vp_maxy);
999 } else {
1000 minx = vp_minx;
1001 miny = vp_miny;
1002 maxx = vp_maxx;
1003 maxy = vp_maxy;
1004 }
1005
1006 /* Hardware needs the min/max to be strictly ordered, so flip if we
1007 * need to. The viewport transformation in the vertex shader will
1008 * handle the negatives if we don't */
1009
1010 if (miny > maxy) {
1011 unsigned temp = miny;
1012 miny = maxy;
1013 maxy = temp;
1014 }
1015
1016 if (minx > maxx) {
1017 unsigned temp = minx;
1018 minx = maxx;
1019 maxx = temp;
1020 }
1021
1022 if (minz > maxz) {
1023 float temp = minz;
1024 minz = maxz;
1025 maxz = temp;
1026 }
1027
1028 /* Clamp to the framebuffer size as a last check */
1029
1030 minx = MIN2(ctx->pipe_framebuffer.width, minx);
1031 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
1032
1033 miny = MIN2(ctx->pipe_framebuffer.height, miny);
1034 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
1035
1036 /* Upload */
1037
1038 mvp->viewport0[0] = minx;
1039 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1040
1041 mvp->viewport0[1] = miny;
1042 mvp->viewport1[1] = MALI_POSITIVE(maxy);
1043
1044 bool clip_near = true;
1045 bool clip_far = true;
1046
1047 if (ctx->rasterizer) {
1048 clip_near = ctx->rasterizer->base.depth_clip_near;
1049 clip_far = ctx->rasterizer->base.depth_clip_far;
1050 }
1051
1052 mvp->clip_minz = clip_near ? minz : -INFINITY;
1053 mvp->clip_maxz = clip_far ? maxz : INFINITY;
1054 }
1055
1056 void
1057 panfrost_emit_viewport(struct panfrost_batch *batch,
1058 struct mali_vertex_tiler_postfix *tiler_postfix)
1059 {
1060 struct panfrost_context *ctx = batch->ctx;
1061 struct mali_viewport mvp;
1062
1063 panfrost_mali_viewport_init(batch->ctx, &mvp);
1064
1065 /* Update the job, unless we're doing wallpapering (whose lack of
1066 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1067 * just... be faster :) */
1068
1069 if (!ctx->wallpaper_batch)
1070 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1071 mvp.viewport0[1],
1072 mvp.viewport1[0] + 1,
1073 mvp.viewport1[1] + 1);
1074
1075 tiler_postfix->viewport = panfrost_pool_upload(&batch->pool, &mvp,
1076 sizeof(mvp));
1077 }
1078
1079 static mali_ptr
1080 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1081 enum pipe_shader_type st,
1082 struct panfrost_constant_buffer *buf,
1083 unsigned index)
1084 {
1085 struct pipe_constant_buffer *cb = &buf->cb[index];
1086 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1087
1088 if (rsrc) {
1089 panfrost_batch_add_bo(batch, rsrc->bo,
1090 PAN_BO_ACCESS_SHARED |
1091 PAN_BO_ACCESS_READ |
1092 panfrost_bo_access_for_stage(st));
1093
 1094                 /* Alignment guaranteed by
1095 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1096 return rsrc->bo->gpu + cb->buffer_offset;
1097 } else if (cb->user_buffer) {
1098 return panfrost_pool_upload(&batch->pool,
1099 cb->user_buffer +
1100 cb->buffer_offset,
1101 cb->buffer_size);
1102 } else {
1103 unreachable("No constant buffer");
1104 }
1105 }
1106
1107 struct sysval_uniform {
1108 union {
1109 float f[4];
1110 int32_t i[4];
1111 uint32_t u[4];
1112 uint64_t du[2];
1113 };
1114 };
1115
1116 static void
1117 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1118 struct sysval_uniform *uniform)
1119 {
1120 struct panfrost_context *ctx = batch->ctx;
1121 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1122
1123 uniform->f[0] = vp->scale[0];
1124 uniform->f[1] = vp->scale[1];
1125 uniform->f[2] = vp->scale[2];
1126 }
1127
1128 static void
1129 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1130 struct sysval_uniform *uniform)
1131 {
1132 struct panfrost_context *ctx = batch->ctx;
1133 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1134
1135 uniform->f[0] = vp->translate[0];
1136 uniform->f[1] = vp->translate[1];
1137 uniform->f[2] = vp->translate[2];
1138 }
1139
1140 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1141 enum pipe_shader_type st,
1142 unsigned int sysvalid,
1143 struct sysval_uniform *uniform)
1144 {
1145 struct panfrost_context *ctx = batch->ctx;
1146 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1147 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1148 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1149 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1150
1151 assert(dim);
1152 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1153
1154 if (dim > 1)
1155 uniform->i[1] = u_minify(tex->texture->height0,
1156 tex->u.tex.first_level);
1157
1158 if (dim > 2)
1159 uniform->i[2] = u_minify(tex->texture->depth0,
1160 tex->u.tex.first_level);
1161
1162 if (is_array)
1163 uniform->i[dim] = tex->texture->array_size;
1164 }
1165
1166 static void
1167 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1168 enum pipe_shader_type st,
1169 unsigned ssbo_id,
1170 struct sysval_uniform *uniform)
1171 {
1172 struct panfrost_context *ctx = batch->ctx;
1173
1174 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1175 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1176
1177 /* Compute address */
1178 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1179
1180 panfrost_batch_add_bo(batch, bo,
1181 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1182 panfrost_bo_access_for_stage(st));
1183
1184 /* Upload address and size as sysval */
1185 uniform->du[0] = bo->gpu + sb.buffer_offset;
1186 uniform->u[2] = sb.buffer_size;
1187 }
1188
1189 static void
1190 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1191 enum pipe_shader_type st,
1192 unsigned samp_idx,
1193 struct sysval_uniform *uniform)
1194 {
1195 struct panfrost_context *ctx = batch->ctx;
1196 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1197
1198 uniform->f[0] = sampl->min_lod;
1199 uniform->f[1] = sampl->max_lod;
1200 uniform->f[2] = sampl->lod_bias;
1201
1202 /* Even without any errata, Midgard represents "no mipmapping" as
1203 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1204 * panfrost_create_sampler_state which also explains our choice of
1205 * epsilon value (again to keep behaviour consistent) */
1206
1207 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1208 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1209 }
1210
1211 static void
1212 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1213 struct sysval_uniform *uniform)
1214 {
1215 struct panfrost_context *ctx = batch->ctx;
1216
1217 uniform->u[0] = ctx->compute_grid->grid[0];
1218 uniform->u[1] = ctx->compute_grid->grid[1];
1219 uniform->u[2] = ctx->compute_grid->grid[2];
1220 }
1221
1222 static void
1223 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1224 struct panfrost_shader_state *ss,
1225 enum pipe_shader_type st)
1226 {
1227 struct sysval_uniform *uniforms = (void *)buf;
1228
1229 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1230 int sysval = ss->sysval[i];
1231
1232 switch (PAN_SYSVAL_TYPE(sysval)) {
1233 case PAN_SYSVAL_VIEWPORT_SCALE:
1234 panfrost_upload_viewport_scale_sysval(batch,
1235 &uniforms[i]);
1236 break;
1237 case PAN_SYSVAL_VIEWPORT_OFFSET:
1238 panfrost_upload_viewport_offset_sysval(batch,
1239 &uniforms[i]);
1240 break;
1241 case PAN_SYSVAL_TEXTURE_SIZE:
1242 panfrost_upload_txs_sysval(batch, st,
1243 PAN_SYSVAL_ID(sysval),
1244 &uniforms[i]);
1245 break;
1246 case PAN_SYSVAL_SSBO:
1247 panfrost_upload_ssbo_sysval(batch, st,
1248 PAN_SYSVAL_ID(sysval),
1249 &uniforms[i]);
1250 break;
1251 case PAN_SYSVAL_NUM_WORK_GROUPS:
1252 panfrost_upload_num_work_groups_sysval(batch,
1253 &uniforms[i]);
1254 break;
1255 case PAN_SYSVAL_SAMPLER:
1256 panfrost_upload_sampler_sysval(batch, st,
1257 PAN_SYSVAL_ID(sysval),
1258 &uniforms[i]);
1259 break;
1260 default:
1261 assert(0);
1262 }
1263 }
1264 }
1265
1266 static const void *
1267 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1268 unsigned index)
1269 {
1270 struct pipe_constant_buffer *cb = &buf->cb[index];
1271 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1272
1273 if (rsrc)
1274 return rsrc->bo->cpu;
1275 else if (cb->user_buffer)
1276 return cb->user_buffer;
1277 else
1278 unreachable("No constant buffer");
1279 }
1280
1281 void
1282 panfrost_emit_const_buf(struct panfrost_batch *batch,
1283 enum pipe_shader_type stage,
1284 struct mali_vertex_tiler_postfix *postfix)
1285 {
1286 struct panfrost_context *ctx = batch->ctx;
1287 struct panfrost_shader_variants *all = ctx->shader[stage];
1288
1289 if (!all)
1290 return;
1291
1292 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1293
1294 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1295
1296 /* Uniforms are implicitly UBO #0 */
1297 bool has_uniforms = buf->enabled_mask & (1 << 0);
1298
1299 /* Allocate room for the sysval and the uniforms */
1300 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1301 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1302 size_t size = sys_size + uniform_size;
1303 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1304 size);
1305
1306 /* Upload sysvals requested by the shader */
1307 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1308
1309 /* Upload uniforms */
1310 if (has_uniforms && uniform_size) {
1311 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1312 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1313 }
1314
1315 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1316 * uploaded */
1317
1318 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1319 assert(ubo_count >= 1);
1320
1321 size_t sz = sizeof(uint64_t) * ubo_count;
1322 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1323 int uniform_count = ss->uniform_count;
1324
1325 /* Upload uniforms as a UBO */
1326 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1327
1328 /* The rest are honest-to-goodness UBOs */
1329
1330 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1331 size_t usz = buf->cb[ubo].buffer_size;
1332 bool enabled = buf->enabled_mask & (1 << ubo);
1333 bool empty = usz == 0;
1334
1335 if (!enabled || empty) {
1336 /* Stub out disabled UBOs to catch accesses */
1337 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1338 continue;
1339 }
1340
1341 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1342 buf, ubo);
1343
1344 unsigned bytes_per_field = 16;
1345 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1346 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
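                /* Sketch with illustrative numbers (not from this commit): a
                 * 100-byte UBO is padded to 112 bytes, i.e. 112 / 16 = 7 of
                 * the 16-byte fields recorded in the descriptor. */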
1347 }
1348
1349 mali_ptr ubufs = panfrost_pool_upload(&batch->pool, ubos, sz);
1350 postfix->uniforms = transfer.gpu;
1351 postfix->uniform_buffers = ubufs;
1352
1353 buf->dirty_mask = 0;
1354 }
1355
1356 void
1357 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1358 const struct pipe_grid_info *info,
1359 struct midgard_payload_vertex_tiler *vtp)
1360 {
1361 struct panfrost_context *ctx = batch->ctx;
1362 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1363 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1364 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1365 128));
1366 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1367 info->grid[2] * 4;
1368 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1369 shared_size,
1370 1);
1371
1372 struct mali_shared_memory shared = {
1373 .shared_memory = bo->gpu,
1374 .shared_workgroup_count =
1375 util_logbase2_ceil(info->grid[0]) +
1376 util_logbase2_ceil(info->grid[1]) +
1377 util_logbase2_ceil(info->grid[2]),
1378 .shared_unk1 = 0x2,
1379 .shared_shift = util_logbase2(single_size) - 1
1380 };
1381
1382 vtp->postfix.shared_memory = panfrost_pool_upload(&batch->pool, &shared,
1383 sizeof(shared));
1384 }
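/* Illustrative example for the helper above (numbers not from this commit):
 * with ss->shared_size <= 128 and an 8x8x1 grid, single_size = 128, the BO
 * holds 128 * 8 * 8 * 1 * 4 = 32768 bytes, shared_workgroup_count =
 * 3 + 3 + 0 = 6, and shared_shift = log2(128) - 1 = 6. */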
1385
1386 static mali_ptr
1387 panfrost_get_tex_desc(struct panfrost_batch *batch,
1388 enum pipe_shader_type st,
1389 struct panfrost_sampler_view *view)
1390 {
1391 if (!view)
1392 return (mali_ptr) 0;
1393
1394 struct pipe_sampler_view *pview = &view->base;
1395 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1396
1397 /* Add the BO to the job so it's retained until the job is done. */
1398
1399 panfrost_batch_add_bo(batch, rsrc->bo,
1400 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1401 panfrost_bo_access_for_stage(st));
1402
1403 panfrost_batch_add_bo(batch, view->bo,
1404 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1405 panfrost_bo_access_for_stage(st));
1406
1407 return view->bo->gpu;
1408 }
1409
1410 static void
1411 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1412 struct pipe_context *pctx)
1413 {
1414 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1415 if (view->texture_bo != rsrc->bo->gpu ||
1416 view->layout != rsrc->layout) {
1417 panfrost_bo_unreference(view->bo);
1418 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1419 }
1420 }
1421
1422 void
1423 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1424 enum pipe_shader_type stage,
1425 struct mali_vertex_tiler_postfix *postfix)
1426 {
1427 struct panfrost_context *ctx = batch->ctx;
1428 struct panfrost_device *device = pan_device(ctx->base.screen);
1429
1430 if (!ctx->sampler_view_count[stage])
1431 return;
1432
1433 if (device->quirks & IS_BIFROST) {
1434 struct bifrost_texture_descriptor *descriptors;
1435
1436 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1437 ctx->sampler_view_count[stage]);
1438
1439 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1440 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1441 struct pipe_sampler_view *pview = &view->base;
1442 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1443 panfrost_update_sampler_view(view, &ctx->base);
1444
1445 /* Add the BOs to the job so they are retained until the job is done. */
1446
1447 panfrost_batch_add_bo(batch, rsrc->bo,
1448 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1449 panfrost_bo_access_for_stage(stage));
1450
1451 panfrost_batch_add_bo(batch, view->bo,
1452 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1453 panfrost_bo_access_for_stage(stage));
1454
1455 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1456 }
1457
1458 postfix->textures = panfrost_pool_upload(&batch->pool,
1459 descriptors,
1460 sizeof(struct bifrost_texture_descriptor) *
1461 ctx->sampler_view_count[stage]);
1462
1463 free(descriptors);
1464 } else {
1465 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1466
1467 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1468 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1469
1470 panfrost_update_sampler_view(view, &ctx->base);
1471
1472 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1473 }
1474
1475 postfix->textures = panfrost_pool_upload(&batch->pool,
1476 trampolines,
1477 sizeof(uint64_t) *
1478 ctx->sampler_view_count[stage]);
1479 }
1480 }
1481
1482 void
1483 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1484 enum pipe_shader_type stage,
1485 struct mali_vertex_tiler_postfix *postfix)
1486 {
1487 struct panfrost_context *ctx = batch->ctx;
1488 struct panfrost_device *device = pan_device(ctx->base.screen);
1489
1490 if (!ctx->sampler_count[stage])
1491 return;
1492
1493 if (device->quirks & IS_BIFROST) {
1494 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1495 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1496 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1497 transfer_size);
1498 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1499
1500 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1501 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1502
1503 postfix->sampler_descriptor = transfer.gpu;
1504 } else {
1505 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1506 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1507 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1508 transfer_size);
1509 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1510
1511 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1512 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1513
1514 postfix->sampler_descriptor = transfer.gpu;
1515 }
1516 }
1517
1518 void
1519 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1520 struct mali_vertex_tiler_postfix *vertex_postfix)
1521 {
1522 struct panfrost_context *ctx = batch->ctx;
1523
1524 if (!ctx->vertex)
1525 return;
1526
1527 struct panfrost_vertex_state *so = ctx->vertex;
1528
1529 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1530 vertex_postfix->attribute_meta = panfrost_pool_upload(&batch->pool, so->hw,
1531 sizeof(*so->hw) *
1532 PAN_MAX_ATTRIBUTE);
1533 }
1534
1535 void
1536 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1537 struct mali_vertex_tiler_postfix *vertex_postfix)
1538 {
1539 struct panfrost_context *ctx = batch->ctx;
1540 struct panfrost_vertex_state *so = ctx->vertex;
1541
1542 /* Staged mali_attr, and index into them. i =/= k, depending on the
1543 * vertex buffer mask and instancing. Twice as much room is allocated,
 1544          * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1545 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1546 unsigned k = 0;
1547
1548 for (unsigned i = 0; i < so->num_elements; ++i) {
1549 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1550 * means duplicating some vertex buffers (who cares? aside from
1551 * maybe some caching implications but I somehow doubt that
1552 * matters) */
1553
1554 struct pipe_vertex_element *elem = &so->pipe[i];
1555 unsigned vbi = elem->vertex_buffer_index;
1556
1557 /* The exception to 1:1 mapping is that we can have multiple
1558 * entries (NPOT divisors), so we fixup anyways */
1559
1560 so->hw[i].index = k;
1561
1562 if (!(ctx->vb_mask & (1 << vbi)))
1563 continue;
1564
1565 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1566 struct panfrost_resource *rsrc;
1567
1568 rsrc = pan_resource(buf->buffer.resource);
1569 if (!rsrc)
1570 continue;
1571
1572 /* Align to 64 bytes by masking off the lower bits. This
1573 * will be adjusted back when we fixup the src_offset in
1574 * mali_attr_meta */
1575
1576 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1577 mali_ptr addr = raw_addr & ~63;
1578 unsigned chopped_addr = raw_addr - addr;
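                /* Illustrative example (not from this commit): with a
                 * 64-byte-aligned BO and buffer_offset = 52, addr drops back
                 * to the BO base, chopped_addr = 52, those 52 bytes are added
                 * back to the size below, and the attribute's src_offset
                 * fixup later re-applies the 52-byte offset. */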
1579
1580 /* Add a dependency of the batch on the vertex buffer */
1581 panfrost_batch_add_bo(batch, rsrc->bo,
1582 PAN_BO_ACCESS_SHARED |
1583 PAN_BO_ACCESS_READ |
1584 PAN_BO_ACCESS_VERTEX_TILER);
1585
1586 /* Set common fields */
1587 attrs[k].elements = addr;
1588 attrs[k].stride = buf->stride;
1589
1590 /* Since we advanced the base pointer, we shrink the buffer
1591 * size */
1592 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1593
1594 /* We need to add the extra size we masked off (for
1595 * correctness) so the data doesn't get clamped away */
1596 attrs[k].size += chopped_addr;
1597
1598 /* For non-instancing make sure we initialize */
1599 attrs[k].shift = attrs[k].extra_flags = 0;
1600
1601 /* Instancing uses a dramatically different code path than
1602 * linear, so dispatch for the actual emission now that the
1603 * common code is finished */
1604
1605 unsigned divisor = elem->instance_divisor;
1606
1607 if (divisor && ctx->instance_count == 1) {
1608 /* Silly corner case where there's a divisor(=1) but
1609 * there's no legitimate instancing. So we want *every*
1610 * attribute to be the same. So set stride to zero so
1611 * we don't go anywhere. */
1612
1613 attrs[k].size = attrs[k].stride + chopped_addr;
1614 attrs[k].stride = 0;
1615 attrs[k++].elements |= MALI_ATTR_LINEAR;
1616 } else if (ctx->instance_count <= 1) {
1617 /* Normal, non-instanced attributes */
1618 attrs[k++].elements |= MALI_ATTR_LINEAR;
1619 } else {
1620 unsigned instance_shift = vertex_postfix->instance_shift;
1621 unsigned instance_odd = vertex_postfix->instance_odd;
1622
1623 k += panfrost_vertex_instanced(ctx->padded_count,
1624 instance_shift,
1625 instance_odd,
1626 divisor, &attrs[k]);
1627 }
1628 }
1629
1630 /* Add special gl_VertexID/gl_InstanceID buffers */
1631
1632 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1633 so->hw[PAN_VERTEX_ID].index = k++;
1634 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1635 so->hw[PAN_INSTANCE_ID].index = k++;
1636
1637 /* Upload whatever we emitted and go */
1638
1639 vertex_postfix->attributes = panfrost_pool_upload(&batch->pool, attrs,
1640 k * sizeof(*attrs));
1641 }
1642
1643 static mali_ptr
1644 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1645 unsigned stride, unsigned count)
1646 {
1647 /* Fill out the descriptor */
1648 slot->stride = stride;
1649 slot->size = stride * count;
1650 slot->shift = slot->extra_flags = 0;
1651
1652 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1653 slot->size);
1654
1655 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1656
1657 return transfer.gpu;
1658 }
1659
1660 static unsigned
1661 panfrost_streamout_offset(unsigned stride, unsigned offset,
1662 struct pipe_stream_output_target *target)
1663 {
1664 return (target->buffer_offset + (offset * stride * 4)) & 63;
1665 }
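/* Illustrative example for the helper above (numbers not from this commit):
 * with buffer_offset = 16, offset = 5 vertices already written and a stride
 * of 4 words, the byte position is 16 + 5 * 16 = 96, so the record is
 * misaligned by 96 & 63 = 32 bytes relative to the 64-byte-aligned base
 * chosen in panfrost_emit_streamout() below. */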
1666
1667 static void
1668 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1669 unsigned stride, unsigned offset, unsigned count,
1670 struct pipe_stream_output_target *target)
1671 {
1672 /* Fill out the descriptor */
1673 slot->stride = stride * 4;
1674 slot->shift = slot->extra_flags = 0;
1675
1676 unsigned max_size = target->buffer_size;
1677 unsigned expected_size = slot->stride * count;
1678
1679 /* Grab the BO and bind it to the batch */
1680 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1681
1682 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1683 * the perspective of the TILER and FRAGMENT.
1684 */
1685 panfrost_batch_add_bo(batch, bo,
1686 PAN_BO_ACCESS_SHARED |
1687 PAN_BO_ACCESS_RW |
1688 PAN_BO_ACCESS_VERTEX_TILER |
1689 PAN_BO_ACCESS_FRAGMENT);
1690
1691 /* We will have an offset applied to get alignment */
1692 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1693 slot->elements = (addr & ~63) | MALI_ATTR_LINEAR;
1694 slot->size = MIN2(max_size, expected_size) + (addr & 63);
1695 }
1696
1697 static bool
1698 has_point_coord(unsigned mask, gl_varying_slot loc)
1699 {
1700 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1701 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1702 else if (loc == VARYING_SLOT_PNTC)
1703 return (mask & (1 << 8));
1704 else
1705 return false;
1706 }
1707
1708 /* Helpers for manipulating stream out information so we can pack varyings
1709 * accordingly. Compute the src_offset for a given captured varying */
1710
1711 static struct pipe_stream_output *
1712 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1713 {
1714 for (unsigned i = 0; i < info->num_outputs; ++i) {
1715 if (info->output[i].register_index == loc)
1716 return &info->output[i];
1717 }
1718
1719 unreachable("Varying not captured");
1720 }
1721
1722 static unsigned
1723 pan_varying_size(enum mali_format fmt)
1724 {
1725 unsigned type = MALI_EXTRACT_TYPE(fmt);
1726 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1727 unsigned bits = MALI_EXTRACT_BITS(fmt);
1728 unsigned bpc = 0;
1729
1730 if (bits == MALI_CHANNEL_FLOAT) {
1731 /* No doubles */
1732 bool fp16 = (type == MALI_FORMAT_SINT);
1733 assert(fp16 || (type == MALI_FORMAT_UNORM));
1734
1735 bpc = fp16 ? 2 : 4;
1736 } else {
1737 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1738
1739 /* See the enums */
1740 bits = 1 << bits;
1741 assert(bits >= 8);
1742 bpc = bits / 8;
1743 }
1744
1745 return bpc * chan;
1746 }
1747
1748 /* Indices for named (non-XFB) varyings that are present. These are packed
1749  * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1750  * PAN_VARY_*). This has the nice property that you can look up the buffer
1751  * index of a given special field given a shift S by:
1752  *
1753  * idx = popcount(P & ((1 << S) - 1))
1754  *
1755  * That is, count the varyings that come earlier; that count is the index of
1756  * this one. Likewise, the total number of special buffers required is simply
1757  * popcount(P)
1758 */
1759
1760 enum pan_special_varying {
1761 PAN_VARY_GENERAL = 0,
1762 PAN_VARY_POSITION = 1,
1763 PAN_VARY_PSIZ = 2,
1764 PAN_VARY_PNTCOORD = 3,
1765 PAN_VARY_FACE = 4,
1766 PAN_VARY_FRAGCOORD = 5,
1767
1768 /* Keep last */
1769 PAN_VARY_MAX,
1770 };
1771
1772 /* Given a varying, figure out which index it corresponds to */
1773
1774 static inline unsigned
1775 pan_varying_index(unsigned present, enum pan_special_varying v)
1776 {
1777 unsigned mask = (1 << v) - 1;
1778 return util_bitcount(present & mask);
1779 }
1780
1781 /* Get the base offset for XFB buffers, which by convention come after
1782 * everything else. Wrapper function for semantic reasons; by construction this
1783 * is just popcount. */
1784
1785 static inline unsigned
1786 pan_xfb_base(unsigned present)
1787 {
1788 return util_bitcount(present);
1789 }
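
/* A worked example of the scheme above (the masks depend on the shaders, so
 * this is just a sketch): with general, position and point size present,
 *
 *      P = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION) |
 *          (1 << PAN_VARY_PSIZ) = 0b111
 *
 *      pan_varying_index(P, PAN_VARY_PSIZ) = popcount(0b011) = 2
 *      pan_xfb_base(P) = popcount(P) = 3
 *
 * so the point size buffer is record #2 and XFB buffers start at record #3. */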
1790
1791 /* Computes the present mask for varyings so we can start emitting varying records */
1792
1793 static inline unsigned
1794 pan_varying_present(
1795 struct panfrost_shader_state *vs,
1796 struct panfrost_shader_state *fs,
1797 unsigned quirks)
1798 {
1799 /* At the moment we always emit general and position buffers. Not
1800 * strictly necessary but usually harmless */
1801
1802 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1803
1804 /* Enable special buffers by the shader info */
1805
1806 if (vs->writes_point_size)
1807 present |= (1 << PAN_VARY_PSIZ);
1808
1809 if (fs->reads_point_coord)
1810 present |= (1 << PAN_VARY_PNTCOORD);
1811
1812 if (fs->reads_face)
1813 present |= (1 << PAN_VARY_FACE);
1814
1815 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1816 present |= (1 << PAN_VARY_FRAGCOORD);
1817
1818 /* Also, if we have a point sprite, we need a point coord buffer */
1819
1820 for (unsigned i = 0; i < fs->varying_count; i++) {
1821 gl_varying_slot loc = fs->varyings_loc[i];
1822
1823 if (has_point_coord(fs->point_sprite_mask, loc))
1824 present |= (1 << PAN_VARY_PNTCOORD);
1825 }
1826
1827 return present;
1828 }
1829
1830 /* Emitters for varying records */
1831
1832 static struct mali_attr_meta
1833 pan_emit_vary(unsigned present, enum pan_special_varying buf,
1834 unsigned quirks, enum mali_format format,
1835 unsigned offset)
1836 {
1837 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1838
1839 struct mali_attr_meta meta = {
1840 .index = pan_varying_index(present, buf),
1841 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1842 .swizzle = quirks & HAS_SWIZZLES ?
1843 panfrost_get_default_swizzle(nr_channels) :
1844 panfrost_bifrost_swizzle(nr_channels),
1845 .format = format,
1846 .src_offset = offset
1847 };
1848
1849 return meta;
1850 }
1851
1852 /* General varying that the other stage does not consume, so its value is discarded */
1853
1854 static struct mali_attr_meta
1855 pan_emit_vary_only(unsigned present, unsigned quirks)
1856 {
1857 return pan_emit_vary(present, 0, quirks, MALI_VARYING_DISCARD, 0);
1858 }
1859
1860 /* Special records */
1861
1862 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1863 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1864 [PAN_VARY_PSIZ] = MALI_R16F,
1865 [PAN_VARY_PNTCOORD] = MALI_R16F,
1866 [PAN_VARY_FACE] = MALI_R32I,
1867 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1868 };
1869
1870 static struct mali_attr_meta
1871 pan_emit_vary_special(unsigned present, enum pan_special_varying buf,
1872 unsigned quirks)
1873 {
1874 assert(buf < PAN_VARY_MAX);
1875 return pan_emit_vary(present, buf, quirks, pan_varying_formats[buf], 0);
1876 }
1877
1878 static enum mali_format
1879 pan_xfb_format(enum mali_format format, unsigned nr)
1880 {
1881 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1882 return MALI_R32F | MALI_NR_CHANNELS(nr);
1883 else
1884 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1885 }
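
/* For example, a varying emitted as fp16 but captured with three components
 * would be promoted here to a three-channel 32-bit float, since transform
 * feedback always writes back at highp precision (see pan_emit_vary_xfb). */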
1886
1887 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1888 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1889 * value. */
1890
1891 static struct mali_attr_meta
1892 pan_emit_vary_xfb(unsigned present,
1893 unsigned max_xfb,
1894 unsigned *streamout_offsets,
1895 unsigned quirks,
1896 enum mali_format format,
1897 struct pipe_stream_output o)
1898 {
1899 	/* Construct a record for the captured varying */
1900 struct mali_attr_meta meta = {
1901 /* XFB buffers come after everything else */
1902 .index = pan_xfb_base(present) + o.output_buffer,
1903
1904 /* As usual unknown bit */
1905 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1906
1907 /* Override swizzle with number of channels */
1908 .swizzle = quirks & HAS_SWIZZLES ?
1909 panfrost_get_default_swizzle(o.num_components) :
1910 panfrost_bifrost_swizzle(o.num_components),
1911
1912 /* Override number of channels and precision to highp */
1913 .format = pan_xfb_format(format, o.num_components),
1914
1915 /* Apply given offsets together */
1916 .src_offset = (o.dst_offset * 4) /* dwords */
1917 + streamout_offsets[o.output_buffer]
1918 };
1919
1920 return meta;
1921 }
1922
1923 /* Determine if we should capture a varying for XFB. This requires actually
1924  * having a buffer for it. If we don't capture it, we'll fall back to a general
1925 * varying path (linked or unlinked, possibly discarding the write) */
1926
1927 static bool
1928 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1929 unsigned loc, unsigned max_xfb)
1930 {
1931 	if (!(xfb->so_mask & (1ull << loc)))
1932 return false;
1933
1934 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1935 return o->output_buffer < max_xfb;
1936 }
1937
1938 /* Higher-level wrapper around all of the above, classifying a varying into one
1939  * of the cases handled above */
1940
1941 static struct mali_attr_meta
1942 panfrost_emit_varying(
1943 struct panfrost_shader_state *stage,
1944 struct panfrost_shader_state *other,
1945 struct panfrost_shader_state *xfb,
1946 unsigned present,
1947 unsigned max_xfb,
1948 unsigned *streamout_offsets,
1949 unsigned quirks,
1950 unsigned *gen_offsets,
1951 enum mali_format *gen_formats,
1952 unsigned *gen_stride,
1953 unsigned idx,
1954 bool should_alloc,
1955 bool is_fragment)
1956 {
1957 gl_varying_slot loc = stage->varyings_loc[idx];
1958 enum mali_format format = stage->varyings[idx];
1959
1960 /* Override format to match linkage */
1961 if (!should_alloc && gen_formats[idx])
1962 format = gen_formats[idx];
1963
1964 if (has_point_coord(stage->point_sprite_mask, loc)) {
1965 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1966 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1967 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1968 return pan_emit_vary_xfb(present, max_xfb, streamout_offsets, quirks, format, *o);
1969 } else if (loc == VARYING_SLOT_POS) {
1970 if (is_fragment)
1971 return pan_emit_vary_special(present, PAN_VARY_FRAGCOORD, quirks);
1972 else
1973 return pan_emit_vary_special(present, PAN_VARY_POSITION, quirks);
1974 } else if (loc == VARYING_SLOT_PSIZ) {
1975 return pan_emit_vary_special(present, PAN_VARY_PSIZ, quirks);
1976 } else if (loc == VARYING_SLOT_PNTC) {
1977 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1978 } else if (loc == VARYING_SLOT_FACE) {
1979 return pan_emit_vary_special(present, PAN_VARY_FACE, quirks);
1980 }
1981
1982 	/* Special cases are exhausted, so this is a general varying. Check if we're linked */
1983 signed other_idx = -1;
1984
1985 for (unsigned j = 0; j < other->varying_count; ++j) {
1986 if (other->varyings_loc[j] == loc) {
1987 other_idx = j;
1988 break;
1989 }
1990 }
1991
1992 if (other_idx < 0)
1993 return pan_emit_vary_only(present, quirks);
1994
1995 unsigned offset = gen_offsets[other_idx];
1996
1997 if (should_alloc) {
1998 		/* We're linked, so allocate space via a watermark allocation */
1999 enum mali_format alt = other->varyings[other_idx];
2000
2001 /* Do interpolation at minimum precision */
2002 unsigned size_main = pan_varying_size(format);
2003 unsigned size_alt = pan_varying_size(alt);
2004 unsigned size = MIN2(size_main, size_alt);
2005
2006 /* If a varying is marked for XFB but not actually captured, we
2007 * should match the format to the format that would otherwise
2008 * be used for XFB, since dEQP checks for invariance here. It's
2009 * unclear if this is required by the spec. */
2010
2011 if (xfb->so_mask & (1ull << loc)) {
2012 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
2013 format = pan_xfb_format(format, o->num_components);
2014 size = pan_varying_size(format);
2015 } else if (size == size_alt) {
2016 format = alt;
2017 }
2018
2019 gen_offsets[idx] = *gen_stride;
2020 gen_formats[other_idx] = format;
2021 offset = *gen_stride;
2022 *gen_stride += size;
2023 }
2024
2025 return pan_emit_vary(present, PAN_VARY_GENERAL,
2026 quirks, format, offset);
2027 }
2028
2029 static void
2030 pan_emit_special_input(union mali_attr *varyings,
2031 unsigned present,
2032 enum pan_special_varying v,
2033 mali_ptr addr)
2034 {
2035 if (present & (1 << v)) {
2036 		/* Write the record exactly once, with the remaining fields
2037 		 * zeroed, both for performance and to avoid flakes from stale data */
2038
2039 union mali_attr s = {
2040 .elements = addr
2041 };
2042
2043 varyings[pan_varying_index(present, v)] = s;
2044 }
2045 }
2046
2047 void
2048 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2049 unsigned vertex_count,
2050 struct mali_vertex_tiler_postfix *vertex_postfix,
2051 struct mali_vertex_tiler_postfix *tiler_postfix,
2052 union midgard_primitive_size *primitive_size)
2053 {
2054 /* Load the shaders */
2055 struct panfrost_context *ctx = batch->ctx;
2056 struct panfrost_device *dev = pan_device(ctx->base.screen);
2057 struct panfrost_shader_state *vs, *fs;
2058 size_t vs_size, fs_size;
2059
2060 /* Allocate the varying descriptor */
2061
2062 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2063 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
2064 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
2065 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
2066
2067 struct panfrost_transfer trans = panfrost_pool_alloc(&batch->pool,
2068 vs_size +
2069 fs_size);
2070
2071 struct pipe_stream_output_info *so = &vs->stream_output;
2072 unsigned present = pan_varying_present(vs, fs, dev->quirks);
2073
2074 /* Check if this varying is linked by us. This is the case for
2075 * general-purpose, non-captured varyings. If it is, link it. If it's
2076 * not, use the provided stream out information to determine the
2077 * offset, since it was already linked for us. */
2078
2079 unsigned gen_offsets[32];
2080 enum mali_format gen_formats[32];
2081 memset(gen_offsets, 0, sizeof(gen_offsets));
2082 memset(gen_formats, 0, sizeof(gen_formats));
2083
2084 unsigned gen_stride = 0;
2085 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
2086 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
2087
2088 unsigned streamout_offsets[32];
2089
2090 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2091 streamout_offsets[i] = panfrost_streamout_offset(
2092 so->stride[i],
2093 ctx->streamout.offsets[i],
2094 ctx->streamout.targets[i]);
2095 }
2096
2097 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
2098 struct mali_attr_meta *ofs = ovs + vs->varying_count;
2099
2100 for (unsigned i = 0; i < vs->varying_count; i++) {
2101 ovs[i] = panfrost_emit_varying(vs, fs, vs, present,
2102 ctx->streamout.num_targets, streamout_offsets,
2103 dev->quirks,
2104 gen_offsets, gen_formats, &gen_stride, i, true, false);
2105 }
2106
2107 for (unsigned i = 0; i < fs->varying_count; i++) {
2108 ofs[i] = panfrost_emit_varying(fs, vs, vs, present,
2109 ctx->streamout.num_targets, streamout_offsets,
2110 dev->quirks,
2111 gen_offsets, gen_formats, &gen_stride, i, false, true);
2112 }
2113
2114 unsigned xfb_base = pan_xfb_base(present);
2115 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
2116 sizeof(union mali_attr) * (xfb_base + ctx->streamout.num_targets));
2117 union mali_attr *varyings = (union mali_attr *) T.cpu;
2118
2119 /* Emit the stream out buffers */
2120
2121 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2122 ctx->vertex_count);
2123
2124 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2125 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2126 so->stride[i],
2127 ctx->streamout.offsets[i],
2128 out_count,
2129 ctx->streamout.targets[i]);
2130 }
2131
2132 panfrost_emit_varyings(batch,
2133 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2134 gen_stride, vertex_count);
2135
2136 /* fp32 vec4 gl_Position */
2137 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2138 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2139 sizeof(float) * 4, vertex_count);
2140
2141 if (present & (1 << PAN_VARY_PSIZ)) {
2142 primitive_size->pointer = panfrost_emit_varyings(batch,
2143 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2144 2, vertex_count);
2145 }
2146
2147 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_VARYING_POINT_COORD);
2148 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_VARYING_FRONT_FACING);
2149 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_VARYING_FRAG_COORD);
2150
2151 vertex_postfix->varyings = T.gpu;
2152 tiler_postfix->varyings = T.gpu;
2153
2154 vertex_postfix->varying_meta = trans.gpu;
2155 tiler_postfix->varying_meta = trans.gpu + vs_size;
2156 }
2157
2158 void
2159 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2160 struct mali_vertex_tiler_prefix *vertex_prefix,
2161 struct mali_vertex_tiler_postfix *vertex_postfix,
2162 struct mali_vertex_tiler_prefix *tiler_prefix,
2163 struct mali_vertex_tiler_postfix *tiler_postfix,
2164 union midgard_primitive_size *primitive_size)
2165 {
2166 struct panfrost_context *ctx = batch->ctx;
2167 struct panfrost_device *device = pan_device(ctx->base.screen);
2168 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2169 struct bifrost_payload_vertex bifrost_vertex = {0,};
2170 struct bifrost_payload_tiler bifrost_tiler = {0,};
2171 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2172 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2173 void *vp, *tp;
2174 size_t vp_size, tp_size;
2175
2176 if (device->quirks & IS_BIFROST) {
2177 bifrost_vertex.prefix = *vertex_prefix;
2178 bifrost_vertex.postfix = *vertex_postfix;
2179 vp = &bifrost_vertex;
2180 vp_size = sizeof(bifrost_vertex);
2181
2182 bifrost_tiler.prefix = *tiler_prefix;
2183 bifrost_tiler.tiler.primitive_size = *primitive_size;
2184 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2185 bifrost_tiler.postfix = *tiler_postfix;
2186 tp = &bifrost_tiler;
2187 tp_size = sizeof(bifrost_tiler);
2188 } else {
2189 midgard_vertex.prefix = *vertex_prefix;
2190 midgard_vertex.postfix = *vertex_postfix;
2191 vp = &midgard_vertex;
2192 vp_size = sizeof(midgard_vertex);
2193
2194 midgard_tiler.prefix = *tiler_prefix;
2195 midgard_tiler.postfix = *tiler_postfix;
2196 midgard_tiler.primitive_size = *primitive_size;
2197 tp = &midgard_tiler;
2198 tp_size = sizeof(midgard_tiler);
2199 }
2200
2201 if (wallpapering) {
2202 /* Inject in reverse order, with "predicted" job indices.
2203 * THIS IS A HACK XXX */
2204 panfrost_new_job(&batch->pool, &batch->scoreboard, JOB_TYPE_TILER, false,
2205 batch->scoreboard.job_index + 2, tp, tp_size, true);
2206 panfrost_new_job(&batch->pool, &batch->scoreboard, JOB_TYPE_VERTEX, false, 0,
2207 vp, vp_size, true);
2208 return;
2209 }
2210
2211 	/* If rasterizer discard is enabled, only submit the vertex job */
2212
2213 bool rasterizer_discard = ctx->rasterizer &&
2214 ctx->rasterizer->base.rasterizer_discard;
2215
2216 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, JOB_TYPE_VERTEX, false, 0,
2217 vp, vp_size, false);
2218
2219 if (rasterizer_discard)
2220 return;
2221
2222 panfrost_new_job(&batch->pool, &batch->scoreboard, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2223 false);
2224 }
2225
2226 /* TODO: stop hardcoding this */
2227 mali_ptr
2228 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2229 {
2230 uint16_t locations[] = {
2231 128, 128,
2232 0, 256,
2233 0, 256,
2234 0, 256,
2235 0, 256,
2236 0, 256,
2237 0, 256,
2238 0, 256,
2239 0, 256,
2240 0, 256,
2241 0, 256,
2242 0, 256,
2243 0, 256,
2244 0, 256,
2245 0, 256,
2246 0, 256,
2247 0, 256,
2248 0, 256,
2249 0, 256,
2250 0, 256,
2251 0, 256,
2252 0, 256,
2253 0, 256,
2254 0, 256,
2255 0, 256,
2256 0, 256,
2257 0, 256,
2258 0, 256,
2259 0, 256,
2260 0, 256,
2261 0, 256,
2262 0, 256,
2263 128, 128,
2264 0, 0,
2265 0, 0,
2266 0, 0,
2267 0, 0,
2268 0, 0,
2269 0, 0,
2270 0, 0,
2271 0, 0,
2272 0, 0,
2273 0, 0,
2274 0, 0,
2275 0, 0,
2276 0, 0,
2277 0, 0,
2278 0, 0,
2279 };
2280
2281 return panfrost_pool_upload(&batch->pool, locations, 96 * sizeof(uint16_t));
2282 }