panfrost: Track surfaces drawn per-batch
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
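/* On Bifrost, the postfix's shared_memory field points at a TLS/scratchpad
 * descriptor sized from the batch's stack requirements, with no
 * workgroup-shared memory; on Midgard it instead points at the framebuffer
 * descriptor (see panfrost_vt_attach_framebuffer below). */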
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_pool_upload(&batch->pool, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_pool_alloc(&batch->pool, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
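/* When the vertex shader does not write gl_PointSize, the constant primitive
 * size is taken from the rasterizer state: point_size for point draws,
 * line_width otherwise. */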
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query) {
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
140 PAN_BO_ACCESS_SHARED |
141 PAN_BO_ACCESS_RW |
142 PAN_BO_ACCESS_FRAGMENT);
143 } else {
144 postfix->occlusion_counter = 0;
145 }
146 }
147
148 void
149 panfrost_vt_init(struct panfrost_context *ctx,
150 enum pipe_shader_type stage,
151 struct mali_vertex_tiler_prefix *prefix,
152 struct mali_vertex_tiler_postfix *postfix)
153 {
154 struct panfrost_device *device = pan_device(ctx->base.screen);
155
156 if (!ctx->shader[stage])
157 return;
158
159 memset(prefix, 0, sizeof(*prefix));
160 memset(postfix, 0, sizeof(*postfix));
161
162 if (device->quirks & IS_BIFROST) {
163 postfix->gl_enables = 0x2;
164 panfrost_vt_emit_shared_memory(ctx, postfix);
165 } else {
166 postfix->gl_enables = 0x6;
167 panfrost_vt_attach_framebuffer(ctx, postfix);
168 }
169
170 if (stage == PIPE_SHADER_FRAGMENT) {
171 panfrost_vt_update_occlusion_query(ctx, postfix);
172 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
173 }
174 }
175
176 static unsigned
177 panfrost_translate_index_size(unsigned size)
178 {
179 switch (size) {
180 case 1:
181 return MALI_DRAW_INDEXED_UINT8;
182
183 case 2:
184 return MALI_DRAW_INDEXED_UINT16;
185
186 case 4:
187 return MALI_DRAW_INDEXED_UINT32;
188
189 default:
190 unreachable("Invalid index size");
191 }
192 }
193
194 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
195  * good for the duration of the draw (transient), though it may last longer.
196  * Also gets the bounds on the index buffer for the range accessed by the
197  * draw. We do these operations together because there are natural
198  * optimizations which require them to be together. */
199
200 static mali_ptr
201 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
202 const struct pipe_draw_info *info,
203 unsigned *min_index, unsigned *max_index)
204 {
205 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
206 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
207 off_t offset = info->start * info->index_size;
208 bool needs_indices = true;
209 mali_ptr out = 0;
210
211 if (info->max_index != ~0u) {
212 *min_index = info->min_index;
213 *max_index = info->max_index;
214 needs_indices = false;
215 }
216
217 if (!info->has_user_indices) {
218 /* Only resources can be directly mapped */
219 panfrost_batch_add_bo(batch, rsrc->bo,
220 PAN_BO_ACCESS_SHARED |
221 PAN_BO_ACCESS_READ |
222 PAN_BO_ACCESS_VERTEX_TILER);
223 out = rsrc->bo->gpu + offset;
224
225 /* Check the cache */
226 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
227 info->start,
228 info->count,
229 min_index,
230 max_index);
231 } else {
232 /* Otherwise, we need to upload to transient memory */
233 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
234 out = panfrost_pool_upload(&batch->pool, ibuf8 + offset,
235 info->count *
236 info->index_size);
237 }
238
239 if (needs_indices) {
240 /* Fallback */
241 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
242
243 if (!info->has_user_indices)
244 panfrost_minmax_cache_add(rsrc->index_cache,
245 info->start, info->count,
246 *min_index, *max_index);
247 }
248
249 return out;
250 }
251
252 void
253 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
254 const struct pipe_draw_info *info,
255 enum mali_draw_mode draw_mode,
256 struct mali_vertex_tiler_postfix *vertex_postfix,
257 struct mali_vertex_tiler_prefix *tiler_prefix,
258 struct mali_vertex_tiler_postfix *tiler_postfix,
259 unsigned *vertex_count,
260 unsigned *padded_count)
261 {
262 tiler_prefix->draw_mode = draw_mode;
263
264 unsigned draw_flags = 0;
265
266 if (panfrost_writes_point_size(ctx))
267 draw_flags |= MALI_DRAW_VARYING_SIZE;
268
269 if (info->primitive_restart)
270 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
271
272         /* These don't make much sense */
273
274 draw_flags |= 0x3000;
275
276 if (info->index_size) {
277 unsigned min_index = 0, max_index = 0;
278
279 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
280 info,
281 &min_index,
282 &max_index);
283
284 /* Use the corresponding values */
285 *vertex_count = max_index - min_index + 1;
286 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
287 tiler_prefix->offset_bias_correction = -min_index;
288 tiler_prefix->index_count = MALI_POSITIVE(info->count);
289 draw_flags |= panfrost_translate_index_size(info->index_size);
290 } else {
291 tiler_prefix->indices = 0;
292 *vertex_count = ctx->vertex_count;
293 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
294 tiler_prefix->offset_bias_correction = 0;
295 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
296 }
297
298 tiler_prefix->unknown_draw = draw_flags;
299
300 /* Encode the padded vertex count */
301
302 if (info->instance_count > 1) {
303 *padded_count = panfrost_padded_vertex_count(*vertex_count);
304
305 unsigned shift = __builtin_ctz(ctx->padded_count);
306 unsigned k = ctx->padded_count >> (shift + 1);
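                /* shift/k decompose the padded count as (2 * k + 1) << shift,
                 * an odd number times a power of two: e.g. a padded count of
                 * 12 gives shift = 2 and k = 1. */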
307
308 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
309 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
310 } else {
311 *padded_count = *vertex_count;
312
313 /* Reset instancing state */
314 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
315 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
316 }
317 }
318
319 static void
320 panfrost_shader_meta_init(struct panfrost_context *ctx,
321 enum pipe_shader_type st,
322 struct mali_shader_meta *meta)
323 {
324 const struct panfrost_device *dev = pan_device(ctx->base.screen);
325 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
326
327 memset(meta, 0, sizeof(*meta));
328 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
329 meta->attribute_count = ss->attribute_count;
330 meta->varying_count = ss->varying_count;
331 meta->texture_count = ctx->sampler_view_count[st];
332 meta->sampler_count = ctx->sampler_count[st];
333
334 if (dev->quirks & IS_BIFROST) {
335 if (st == PIPE_SHADER_VERTEX)
336 meta->bifrost1.unk1 = 0x800000;
337 else {
338 /* First clause ATEST |= 0x4000000.
339 * Less than 32 regs |= 0x200 */
340 meta->bifrost1.unk1 = 0x950020;
341 }
342
343 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
344 if (st == PIPE_SHADER_VERTEX)
345 meta->bifrost2.preload_regs = 0xC0;
346 else {
347 meta->bifrost2.preload_regs = 0x1;
348 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
349 }
350
351 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
352 ss->uniform_cutoff);
353 } else {
354 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
355 ss->uniform_cutoff);
356 meta->midgard1.work_count = ss->work_reg_count;
357
358 /* TODO: This is not conformant on ES3 */
359 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
360
361 meta->midgard1.flags_lo = 0x20;
362 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
363
364 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
365 }
366 }
367
368 static unsigned
369 panfrost_translate_compare_func(enum pipe_compare_func in)
370 {
371 switch (in) {
372 case PIPE_FUNC_NEVER:
373 return MALI_FUNC_NEVER;
374
375 case PIPE_FUNC_LESS:
376 return MALI_FUNC_LESS;
377
378 case PIPE_FUNC_EQUAL:
379 return MALI_FUNC_EQUAL;
380
381 case PIPE_FUNC_LEQUAL:
382 return MALI_FUNC_LEQUAL;
383
384 case PIPE_FUNC_GREATER:
385 return MALI_FUNC_GREATER;
386
387 case PIPE_FUNC_NOTEQUAL:
388 return MALI_FUNC_NOTEQUAL;
389
390 case PIPE_FUNC_GEQUAL:
391 return MALI_FUNC_GEQUAL;
392
393 case PIPE_FUNC_ALWAYS:
394 return MALI_FUNC_ALWAYS;
395
396 default:
397 unreachable("Invalid func");
398 }
399 }
400
401 static unsigned
402 panfrost_translate_stencil_op(enum pipe_stencil_op in)
403 {
404 switch (in) {
405 case PIPE_STENCIL_OP_KEEP:
406 return MALI_STENCIL_KEEP;
407
408 case PIPE_STENCIL_OP_ZERO:
409 return MALI_STENCIL_ZERO;
410
411 case PIPE_STENCIL_OP_REPLACE:
412 return MALI_STENCIL_REPLACE;
413
414 case PIPE_STENCIL_OP_INCR:
415 return MALI_STENCIL_INCR;
416
417 case PIPE_STENCIL_OP_DECR:
418 return MALI_STENCIL_DECR;
419
420 case PIPE_STENCIL_OP_INCR_WRAP:
421 return MALI_STENCIL_INCR_WRAP;
422
423 case PIPE_STENCIL_OP_DECR_WRAP:
424 return MALI_STENCIL_DECR_WRAP;
425
426 case PIPE_STENCIL_OP_INVERT:
427 return MALI_STENCIL_INVERT;
428
429 default:
430 unreachable("Invalid stencil op");
431 }
432 }
433
434 static unsigned
435 translate_tex_wrap(enum pipe_tex_wrap w)
436 {
437 switch (w) {
438 case PIPE_TEX_WRAP_REPEAT:
439 return MALI_WRAP_REPEAT;
440
441 case PIPE_TEX_WRAP_CLAMP:
442 return MALI_WRAP_CLAMP;
443
444 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
445 return MALI_WRAP_CLAMP_TO_EDGE;
446
447 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
448 return MALI_WRAP_CLAMP_TO_BORDER;
449
450 case PIPE_TEX_WRAP_MIRROR_REPEAT:
451 return MALI_WRAP_MIRRORED_REPEAT;
452
453 case PIPE_TEX_WRAP_MIRROR_CLAMP:
454 return MALI_WRAP_MIRRORED_CLAMP;
455
456 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
457 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
458
459 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
460 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
461
462 default:
463 unreachable("Invalid wrap");
464 }
465 }
466
467 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
468 struct mali_sampler_descriptor *hw)
469 {
470 unsigned func = panfrost_translate_compare_func(cso->compare_func);
471 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
472 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
473 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
474 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
475 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
476 unsigned mip_filter = mip_linear ?
477 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
478 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
479
480 *hw = (struct mali_sampler_descriptor) {
481 .filter_mode = min_filter | mag_filter | mip_filter |
482 normalized,
483 .wrap_s = translate_tex_wrap(cso->wrap_s),
484 .wrap_t = translate_tex_wrap(cso->wrap_t),
485 .wrap_r = translate_tex_wrap(cso->wrap_r),
486 .compare_func = panfrost_flip_compare_func(func),
487 .border_color = {
488 cso->border_color.f[0],
489 cso->border_color.f[1],
490 cso->border_color.f[2],
491 cso->border_color.f[3]
492 },
493 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
494 .max_lod = FIXED_16(cso->max_lod, false),
495 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
496 .seamless_cube_map = cso->seamless_cube_map,
497 };
498
499 /* If necessary, we disable mipmapping in the sampler descriptor by
500 * clamping the LOD as tight as possible (from 0 to epsilon,
501 * essentially -- remember these are fixed point numbers, so
502 * epsilon=1/256) */
503
504 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
505 hw->max_lod = hw->min_lod + 1;
506 }
507
508 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
509 struct bifrost_sampler_descriptor *hw)
510 {
511 *hw = (struct bifrost_sampler_descriptor) {
512 .unk1 = 0x1,
513 .wrap_s = translate_tex_wrap(cso->wrap_s),
514 .wrap_t = translate_tex_wrap(cso->wrap_t),
515 .wrap_r = translate_tex_wrap(cso->wrap_r),
516 .unk8 = 0x8,
517 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
518 .norm_coords = cso->normalized_coords,
519 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
520 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
521 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
522 .max_lod = FIXED_16(cso->max_lod, false),
523 };
524
525 /* If necessary, we disable mipmapping in the sampler descriptor by
526 * clamping the LOD as tight as possible (from 0 to epsilon,
527 * essentially -- remember these are fixed point numbers, so
528 * epsilon=1/256) */
529
530 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
531 hw->max_lod = hw->min_lod + 1;
532 }
533
534 static void
535 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
536 struct mali_stencil_test *out)
537 {
538 out->ref = 0; /* Gallium gets it from elsewhere */
539
540 out->mask = in->valuemask;
541 out->func = panfrost_translate_compare_func(in->func);
542 out->sfail = panfrost_translate_stencil_op(in->fail_op);
543 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
544 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
545 }
546
547 static void
548 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
549 struct mali_shader_meta *fragmeta)
550 {
551 if (!ctx->rasterizer) {
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
553 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
554 fragmeta->depth_units = 0.0f;
555 fragmeta->depth_factor = 0.0f;
556 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
557 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
558 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, true);
559 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, true);
560 return;
561 }
562
563 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
564
565 bool msaa = rast->multisample;
566
567 /* TODO: Sample size */
568 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
569 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
570
571 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE,
572 msaa && ctx->min_samples > 1);
573
574 fragmeta->depth_units = rast->offset_units * 2.0f;
575 fragmeta->depth_factor = rast->offset_scale;
576
577         /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
578
579 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
580 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
581
582 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
583 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
584 }
585
586 static void
587 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
588 struct mali_shader_meta *fragmeta)
589 {
590 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
591 int zfunc = PIPE_FUNC_ALWAYS;
592
593 if (!zsa) {
594 struct pipe_stencil_state default_stencil = {
595 .enabled = 0,
596 .func = PIPE_FUNC_ALWAYS,
597 .fail_op = MALI_STENCIL_KEEP,
598 .zfail_op = MALI_STENCIL_KEEP,
599 .zpass_op = MALI_STENCIL_KEEP,
600 .writemask = 0xFF,
601 .valuemask = 0xFF
602 };
603
604 panfrost_make_stencil_state(&default_stencil,
605 &fragmeta->stencil_front);
606 fragmeta->stencil_mask_front = default_stencil.writemask;
607 fragmeta->stencil_back = fragmeta->stencil_front;
608 fragmeta->stencil_mask_back = default_stencil.writemask;
609 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
610 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
611 } else {
612 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
613 zsa->stencil[0].enabled);
614 panfrost_make_stencil_state(&zsa->stencil[0],
615 &fragmeta->stencil_front);
616 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
617 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
618
619 /* If back-stencil is not enabled, use the front values */
620
621 if (zsa->stencil[1].enabled) {
622 panfrost_make_stencil_state(&zsa->stencil[1],
623 &fragmeta->stencil_back);
624 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
625 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
626 } else {
627 fragmeta->stencil_back = fragmeta->stencil_front;
628 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
629 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
630 }
631
632 if (zsa->depth.enabled)
633 zfunc = zsa->depth.func;
634
635 /* Depth state (TODO: Refactor) */
636
637 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
638 zsa->depth.writemask);
639 }
640
641 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
642 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
643 }
644
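/* Decide whether the fragment shader actually needs to run: it can be skipped
 * when it has no side effects, writes no colour, and writes neither depth nor
 * stencil. */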
645 static bool
646 panfrost_fs_required(
647 struct panfrost_shader_state *fs,
648 struct panfrost_blend_final *blend,
649 unsigned rt_count)
650 {
651 /* If we generally have side effects */
652 if (fs->fs_sidefx)
653 return true;
654
655 /* If colour is written we need to execute */
656 for (unsigned i = 0; i < rt_count; ++i) {
657 if (!blend[i].no_colour)
658 return true;
659 }
660
661 /* If depth is written and not implied we need to execute.
662 * TODO: Predicate on Z/S writes being enabled */
663 return (fs->writes_depth || fs->writes_stencil);
664 }
665
666 static void
667 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
668 struct mali_shader_meta *fragmeta,
669 void *rts)
670 {
671 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
672 const struct panfrost_device *dev = pan_device(ctx->base.screen);
673 struct panfrost_shader_state *fs;
674 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
675
676 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
677 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
678 !ctx->blend->base.dither);
679
680 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
681 ctx->blend->base.alpha_to_coverage);
682
683 /* Get blending setup */
684 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
685
686 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
687 unsigned shader_offset = 0;
688 struct panfrost_bo *shader_bo = NULL;
689
690 for (unsigned c = 0; c < rt_count; ++c)
691 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
692 &shader_offset);
693
694 /* Disable shader execution if we can */
695 if (dev->quirks & MIDGARD_SHADERLESS
696 && !panfrost_fs_required(fs, blend, rt_count)) {
697 fragmeta->shader = 0;
698 fragmeta->attribute_count = 0;
699 fragmeta->varying_count = 0;
700 fragmeta->texture_count = 0;
701 fragmeta->sampler_count = 0;
702
703 /* This feature is not known to work on Bifrost */
704 fragmeta->midgard1.work_count = 1;
705 fragmeta->midgard1.uniform_count = 0;
706 fragmeta->midgard1.uniform_buffer_count = 0;
707 }
708
709 /* If there is a blend shader, work registers are shared. We impose 8
710 * work registers as a limit for blend shaders. Should be lower XXX */
711
712 if (!(dev->quirks & IS_BIFROST)) {
713 for (unsigned c = 0; c < rt_count; ++c) {
714 if (blend[c].is_shader) {
715 fragmeta->midgard1.work_count =
716 MAX2(fragmeta->midgard1.work_count, 8);
717 }
718 }
719 }
720
721         /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
722          * copied to the appended blend_meta (by convention), but this is the
723          * field actually read by the hardware (or maybe both are read?).
724          * Specify the last RTi with a blend shader. */
725
726 fragmeta->blend.shader = 0;
727
728 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
729 if (!blend[rt].is_shader)
730 continue;
731
732 fragmeta->blend.shader = blend[rt].shader.gpu |
733 blend[rt].shader.first_tag;
734 break;
735 }
736
737 if (dev->quirks & MIDGARD_SFBD) {
738                 /* On single render target (SFBD) platforms, the blend
739                  * information is inside the shader meta itself. We additionally
740 * need to signal CAN_DISCARD for nontrivial blend modes (so
741 * we're able to read back the destination buffer) */
742
743 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
744 blend[0].is_shader);
745
746 if (!blend[0].is_shader) {
747 fragmeta->blend.equation = *blend[0].equation.equation;
748 fragmeta->blend.constant = blend[0].equation.constant;
749 }
750
751 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
752 !blend[0].no_blending || fs->can_discard);
753
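                /* Record that colour buffer 0 is drawn by this batch, so the
                 * batch tracks which surfaces it touches (cf. the per-RT
                 * PIPE_CLEAR_COLOR0 << i in the MFBD path below). */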
754 batch->draws |= PIPE_CLEAR_COLOR0;
755 return;
756 }
757
758 if (dev->quirks & IS_BIFROST) {
759 bool no_blend = true;
760
761 for (unsigned i = 0; i < rt_count; ++i)
762 no_blend &= (blend[i].no_blending | blend[i].no_colour);
763
764 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
765 !fs->can_discard && !fs->writes_depth && no_blend);
766 }
767
768 /* Additional blend descriptor tacked on for jobs using MFBD */
769
770 for (unsigned i = 0; i < rt_count; ++i) {
771 unsigned flags = 0;
772
773 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
774 flags = 0x200;
775 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
776
777 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
778 (ctx->pipe_framebuffer.cbufs[i]) &&
779 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
780
781 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
782 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
783 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
784 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
785 }
786
787 if (dev->quirks & IS_BIFROST) {
788 struct bifrost_blend_rt *brts = rts;
789
790 brts[i].flags = flags;
791
792 if (blend[i].is_shader) {
793                                 /* The blend shader's address needs to have
794                                  * the same top 32 bits as the fragment shader's.
795 * TODO: Ensure that's always the case.
796 */
797 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
798 (fs->bo->gpu & (0xffffffffull << 32)));
799 brts[i].shader = blend[i].shader.gpu;
800 brts[i].unk2 = 0x0;
801 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
802 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
803 const struct util_format_description *format_desc;
804 format_desc = util_format_description(format);
805
806 brts[i].equation = *blend[i].equation.equation;
807
808 /* TODO: this is a bit more complicated */
809 brts[i].constant = blend[i].equation.constant;
810
811 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
812
813 /* 0x19 disables blending and forces REPLACE
814 * mode (equivalent to rgb_mode = alpha_mode =
815 * x122, colour mask = 0xF). 0x1a allows
816 * blending. */
817 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
818
819 brts[i].shader_type = fs->blend_types[i];
820 } else {
821 /* Dummy attachment for depth-only */
822 brts[i].unk2 = 0x3;
823 brts[i].shader_type = fs->blend_types[i];
824 }
825 } else {
826 struct midgard_blend_rt *mrts = rts;
827 mrts[i].flags = flags;
828
829 if (blend[i].is_shader) {
830 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
831 } else {
832 mrts[i].blend.equation = *blend[i].equation.equation;
833 mrts[i].blend.constant = blend[i].equation.constant;
834 }
835 }
836 }
837 }
838
839 static void
840 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
841 struct mali_shader_meta *fragmeta,
842 void *rts)
843 {
844 const struct panfrost_device *dev = pan_device(ctx->base.screen);
845 struct panfrost_shader_state *fs;
846
847 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
848
849 bool msaa = ctx->rasterizer && ctx->rasterizer->base.multisample;
850 fragmeta->coverage_mask = (msaa ? ctx->sample_mask : ~0) & 0xF;
851
852 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
853 fragmeta->unknown2_4 = 0x4e0;
854
855 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
856 * is required (independent of 32-bit/64-bit descriptors), or why it's
857 * not used on later GPU revisions. Otherwise, all shader jobs fault on
858 * these earlier chips (perhaps this is a chicken bit of some kind).
859 * More investigation is needed. */
860
861 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
862
863 if (dev->quirks & IS_BIFROST) {
864 /* TODO */
865 } else {
866                 /* If it's legal to do so in the given shader, we try to
867                  * enable early-z testing. TODO: respect e-z force */
868
869 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
870 !fs->can_discard && !fs->writes_global &&
871 !fs->writes_depth && !fs->writes_stencil &&
872 !ctx->blend->base.alpha_to_coverage);
873
874 /* Add the writes Z/S flags if needed. */
875 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
876 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
877
878 /* Any time texturing is used, derivatives are implicitly calculated,
879 * so we need to enable helper invocations */
880
881 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
882 fs->helper_invocations);
883
884 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
885
886 bool depth_enabled = fs->writes_depth ||
887 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
888
889 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
890 fs->outputs_read || (!depth_enabled && fs->can_discard));
891 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
892 }
893
894 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
895 panfrost_frag_meta_zsa_update(ctx, fragmeta);
896 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
897 }
898
899 void
900 panfrost_emit_shader_meta(struct panfrost_batch *batch,
901 enum pipe_shader_type st,
902 struct mali_vertex_tiler_postfix *postfix)
903 {
904 struct panfrost_context *ctx = batch->ctx;
905 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
906
907 if (!ss) {
908 postfix->shader = 0;
909 return;
910 }
911
912 struct mali_shader_meta meta;
913
914 panfrost_shader_meta_init(ctx, st, &meta);
915
916 /* Add the shader BO to the batch. */
917 panfrost_batch_add_bo(batch, ss->bo,
918 PAN_BO_ACCESS_PRIVATE |
919 PAN_BO_ACCESS_READ |
920 panfrost_bo_access_for_stage(st));
921
922 mali_ptr shader_ptr;
923
924 if (st == PIPE_SHADER_FRAGMENT) {
925 struct panfrost_device *dev = pan_device(ctx->base.screen);
926 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
927 size_t desc_size = sizeof(meta);
928 void *rts = NULL;
929 struct panfrost_transfer xfer;
930 unsigned rt_size;
931
932 if (dev->quirks & MIDGARD_SFBD)
933 rt_size = 0;
934 else if (dev->quirks & IS_BIFROST)
935 rt_size = sizeof(struct bifrost_blend_rt);
936 else
937 rt_size = sizeof(struct midgard_blend_rt);
938
939 desc_size += rt_size * rt_count;
940
941 if (rt_size)
942 rts = rzalloc_size(ctx, rt_size * rt_count);
943
944 panfrost_frag_shader_meta_init(ctx, &meta, rts);
945
946 xfer = panfrost_pool_alloc(&batch->pool, desc_size);
947
948 memcpy(xfer.cpu, &meta, sizeof(meta));
949 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
950
951 if (rt_size)
952 ralloc_free(rts);
953
954 shader_ptr = xfer.gpu;
955 } else {
956 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
957 sizeof(meta));
958 }
959
960 postfix->shader = shader_ptr;
961 }
962
963 static void
964 panfrost_mali_viewport_init(struct panfrost_context *ctx,
965 struct mali_viewport *mvp)
966 {
967 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
968
969 /* Clip bounds are encoded as floats. The viewport itself is encoded as
970 * (somewhat) asymmetric ints. */
971
972 const struct pipe_scissor_state *ss = &ctx->scissor;
973
974 memset(mvp, 0, sizeof(*mvp));
975
976 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
977 * each direction. Clipping to the viewport in theory should work, but
978 * in practice causes issues when we're not explicitly trying to
979 * scissor */
980
981 *mvp = (struct mali_viewport) {
982 .clip_minx = -INFINITY,
983 .clip_miny = -INFINITY,
984 .clip_maxx = INFINITY,
985 .clip_maxy = INFINITY,
986 };
987
988 /* Always scissor to the viewport by default. */
989 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
990 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
991
992 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
993 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
994
995 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
996 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
997
998 /* Apply the scissor test */
999
1000 unsigned minx, miny, maxx, maxy;
1001
1002 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
1003 minx = MAX2(ss->minx, vp_minx);
1004 miny = MAX2(ss->miny, vp_miny);
1005 maxx = MIN2(ss->maxx, vp_maxx);
1006 maxy = MIN2(ss->maxy, vp_maxy);
1007 } else {
1008 minx = vp_minx;
1009 miny = vp_miny;
1010 maxx = vp_maxx;
1011 maxy = vp_maxy;
1012 }
1013
1014 /* Hardware needs the min/max to be strictly ordered, so flip if we
1015 * need to. The viewport transformation in the vertex shader will
1016 * handle the negatives if we don't */
1017
1018 if (miny > maxy) {
1019 unsigned temp = miny;
1020 miny = maxy;
1021 maxy = temp;
1022 }
1023
1024 if (minx > maxx) {
1025 unsigned temp = minx;
1026 minx = maxx;
1027 maxx = temp;
1028 }
1029
1030 if (minz > maxz) {
1031 float temp = minz;
1032 minz = maxz;
1033 maxz = temp;
1034 }
1035
1036 /* Clamp to the framebuffer size as a last check */
1037
1038 minx = MIN2(ctx->pipe_framebuffer.width, minx);
1039 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
1040
1041 miny = MIN2(ctx->pipe_framebuffer.height, miny);
1042 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
1043
1044 /* Upload */
1045
1046 mvp->viewport0[0] = minx;
1047 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1048
1049 mvp->viewport0[1] = miny;
1050 mvp->viewport1[1] = MALI_POSITIVE(maxy);
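        /* viewport1 stores the inclusive maximum, i.e. max - 1 (MALI_POSITIVE);
         * hence the + 1 when unioning the batch scissor in
         * panfrost_emit_viewport() below. */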
1051
1052 bool clip_near = true;
1053 bool clip_far = true;
1054
1055 if (ctx->rasterizer) {
1056 clip_near = ctx->rasterizer->base.depth_clip_near;
1057 clip_far = ctx->rasterizer->base.depth_clip_far;
1058 }
1059
1060 mvp->clip_minz = clip_near ? minz : -INFINITY;
1061 mvp->clip_maxz = clip_far ? maxz : INFINITY;
1062 }
1063
1064 void
1065 panfrost_emit_viewport(struct panfrost_batch *batch,
1066 struct mali_vertex_tiler_postfix *tiler_postfix)
1067 {
1068 struct panfrost_context *ctx = batch->ctx;
1069 struct mali_viewport mvp;
1070
1071 panfrost_mali_viewport_init(batch->ctx, &mvp);
1072
1073 /* Update the job, unless we're doing wallpapering (whose lack of
1074 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1075 * just... be faster :) */
1076
1077 if (!ctx->wallpaper_batch)
1078 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1079 mvp.viewport0[1],
1080 mvp.viewport1[0] + 1,
1081 mvp.viewport1[1] + 1);
1082
1083 tiler_postfix->viewport = panfrost_pool_upload(&batch->pool, &mvp,
1084 sizeof(mvp));
1085 }
1086
1087 static mali_ptr
1088 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1089 enum pipe_shader_type st,
1090 struct panfrost_constant_buffer *buf,
1091 unsigned index)
1092 {
1093 struct pipe_constant_buffer *cb = &buf->cb[index];
1094 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1095
1096 if (rsrc) {
1097 panfrost_batch_add_bo(batch, rsrc->bo,
1098 PAN_BO_ACCESS_SHARED |
1099 PAN_BO_ACCESS_READ |
1100 panfrost_bo_access_for_stage(st));
1101
1102                 /* Alignment guaranteed by
1103 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1104 return rsrc->bo->gpu + cb->buffer_offset;
1105 } else if (cb->user_buffer) {
1106 return panfrost_pool_upload(&batch->pool,
1107 cb->user_buffer +
1108 cb->buffer_offset,
1109 cb->buffer_size);
1110 } else {
1111 unreachable("No constant buffer");
1112 }
1113 }
1114
1115 struct sysval_uniform {
1116 union {
1117 float f[4];
1118 int32_t i[4];
1119 uint32_t u[4];
1120 uint64_t du[2];
1121 };
1122 };
1123
1124 static void
1125 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1126 struct sysval_uniform *uniform)
1127 {
1128 struct panfrost_context *ctx = batch->ctx;
1129 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1130
1131 uniform->f[0] = vp->scale[0];
1132 uniform->f[1] = vp->scale[1];
1133 uniform->f[2] = vp->scale[2];
1134 }
1135
1136 static void
1137 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1138 struct sysval_uniform *uniform)
1139 {
1140 struct panfrost_context *ctx = batch->ctx;
1141 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1142
1143 uniform->f[0] = vp->translate[0];
1144 uniform->f[1] = vp->translate[1];
1145 uniform->f[2] = vp->translate[2];
1146 }
1147
1148 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1149 enum pipe_shader_type st,
1150 unsigned int sysvalid,
1151 struct sysval_uniform *uniform)
1152 {
1153 struct panfrost_context *ctx = batch->ctx;
1154 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1155 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1156 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1157 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1158
1159 assert(dim);
1160 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1161
1162 if (dim > 1)
1163 uniform->i[1] = u_minify(tex->texture->height0,
1164 tex->u.tex.first_level);
1165
1166 if (dim > 2)
1167 uniform->i[2] = u_minify(tex->texture->depth0,
1168 tex->u.tex.first_level);
1169
1170 if (is_array)
1171 uniform->i[dim] = tex->texture->array_size;
1172 }
1173
1174 static void
1175 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1176 enum pipe_shader_type st,
1177 unsigned ssbo_id,
1178 struct sysval_uniform *uniform)
1179 {
1180 struct panfrost_context *ctx = batch->ctx;
1181
1182 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1183 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1184
1185 /* Compute address */
1186 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1187
1188 panfrost_batch_add_bo(batch, bo,
1189 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1190 panfrost_bo_access_for_stage(st));
1191
1192 /* Upload address and size as sysval */
1193 uniform->du[0] = bo->gpu + sb.buffer_offset;
1194 uniform->u[2] = sb.buffer_size;
1195 }
1196
1197 static void
1198 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1199 enum pipe_shader_type st,
1200 unsigned samp_idx,
1201 struct sysval_uniform *uniform)
1202 {
1203 struct panfrost_context *ctx = batch->ctx;
1204 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1205
1206 uniform->f[0] = sampl->min_lod;
1207 uniform->f[1] = sampl->max_lod;
1208 uniform->f[2] = sampl->lod_bias;
1209
1210 /* Even without any errata, Midgard represents "no mipmapping" as
1211 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1212 * panfrost_create_sampler_state which also explains our choice of
1213 * epsilon value (again to keep behaviour consistent) */
1214
1215 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1216 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1217 }
1218
1219 static void
1220 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1221 struct sysval_uniform *uniform)
1222 {
1223 struct panfrost_context *ctx = batch->ctx;
1224
1225 uniform->u[0] = ctx->compute_grid->grid[0];
1226 uniform->u[1] = ctx->compute_grid->grid[1];
1227 uniform->u[2] = ctx->compute_grid->grid[2];
1228 }
1229
1230 static void
1231 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1232 struct panfrost_shader_state *ss,
1233 enum pipe_shader_type st)
1234 {
1235 struct sysval_uniform *uniforms = (void *)buf;
1236
1237 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1238 int sysval = ss->sysval[i];
1239
1240 switch (PAN_SYSVAL_TYPE(sysval)) {
1241 case PAN_SYSVAL_VIEWPORT_SCALE:
1242 panfrost_upload_viewport_scale_sysval(batch,
1243 &uniforms[i]);
1244 break;
1245 case PAN_SYSVAL_VIEWPORT_OFFSET:
1246 panfrost_upload_viewport_offset_sysval(batch,
1247 &uniforms[i]);
1248 break;
1249 case PAN_SYSVAL_TEXTURE_SIZE:
1250 panfrost_upload_txs_sysval(batch, st,
1251 PAN_SYSVAL_ID(sysval),
1252 &uniforms[i]);
1253 break;
1254 case PAN_SYSVAL_SSBO:
1255 panfrost_upload_ssbo_sysval(batch, st,
1256 PAN_SYSVAL_ID(sysval),
1257 &uniforms[i]);
1258 break;
1259 case PAN_SYSVAL_NUM_WORK_GROUPS:
1260 panfrost_upload_num_work_groups_sysval(batch,
1261 &uniforms[i]);
1262 break;
1263 case PAN_SYSVAL_SAMPLER:
1264 panfrost_upload_sampler_sysval(batch, st,
1265 PAN_SYSVAL_ID(sysval),
1266 &uniforms[i]);
1267 break;
1268 default:
1269 assert(0);
1270 }
1271 }
1272 }
1273
1274 static const void *
1275 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1276 unsigned index)
1277 {
1278 struct pipe_constant_buffer *cb = &buf->cb[index];
1279 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1280
1281 if (rsrc)
1282 return rsrc->bo->cpu;
1283 else if (cb->user_buffer)
1284 return cb->user_buffer;
1285 else
1286 unreachable("No constant buffer");
1287 }
1288
1289 void
1290 panfrost_emit_const_buf(struct panfrost_batch *batch,
1291 enum pipe_shader_type stage,
1292 struct mali_vertex_tiler_postfix *postfix)
1293 {
1294 struct panfrost_context *ctx = batch->ctx;
1295 struct panfrost_shader_variants *all = ctx->shader[stage];
1296
1297 if (!all)
1298 return;
1299
1300 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1301
1302 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1303
1304 /* Uniforms are implicitly UBO #0 */
1305 bool has_uniforms = buf->enabled_mask & (1 << 0);
1306
1307         /* Allocate room for the sysvals and the uniforms */
1308 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1309 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1310 size_t size = sys_size + uniform_size;
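        /* A single allocation holds the sysvals followed by the user uniforms;
         * the same GPU pointer is exposed as postfix->uniforms and wrapped as
         * UBO #0 below. */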
1311 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1312 size);
1313
1314 /* Upload sysvals requested by the shader */
1315 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1316
1317 /* Upload uniforms */
1318 if (has_uniforms && uniform_size) {
1319 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1320 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1321 }
1322
1323 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1324 * uploaded */
1325
1326 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1327 assert(ubo_count >= 1);
1328
1329 size_t sz = sizeof(uint64_t) * ubo_count;
1330 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1331 int uniform_count = ss->uniform_count;
1332
1333 /* Upload uniforms as a UBO */
1334 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1335
1336 /* The rest are honest-to-goodness UBOs */
1337
1338 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1339 size_t usz = buf->cb[ubo].buffer_size;
1340 bool enabled = buf->enabled_mask & (1 << ubo);
1341 bool empty = usz == 0;
1342
1343 if (!enabled || empty) {
1344 /* Stub out disabled UBOs to catch accesses */
1345 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1346 continue;
1347 }
1348
1349 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1350 buf, ubo);
1351
1352 unsigned bytes_per_field = 16;
1353 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1354 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1355 }
1356
1357 mali_ptr ubufs = panfrost_pool_upload(&batch->pool, ubos, sz);
1358 postfix->uniforms = transfer.gpu;
1359 postfix->uniform_buffers = ubufs;
1360
1361 buf->dirty_mask = 0;
1362 }
1363
1364 void
1365 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1366 const struct pipe_grid_info *info,
1367 struct midgard_payload_vertex_tiler *vtp)
1368 {
1369 struct panfrost_context *ctx = batch->ctx;
1370 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1371 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1372 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1373 128));
1374 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1375 info->grid[2] * 4;
1376 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1377 shared_size,
1378 1);
1379
1380 struct mali_shared_memory shared = {
1381 .shared_memory = bo->gpu,
1382 .shared_workgroup_count =
1383 util_logbase2_ceil(info->grid[0]) +
1384 util_logbase2_ceil(info->grid[1]) +
1385 util_logbase2_ceil(info->grid[2]),
1386 .shared_unk1 = 0x2,
1387 .shared_shift = util_logbase2(single_size) - 1
1388 };
1389
1390 vtp->postfix.shared_memory = panfrost_pool_upload(&batch->pool, &shared,
1391 sizeof(shared));
1392 }
1393
1394 static mali_ptr
1395 panfrost_get_tex_desc(struct panfrost_batch *batch,
1396 enum pipe_shader_type st,
1397 struct panfrost_sampler_view *view)
1398 {
1399 if (!view)
1400 return (mali_ptr) 0;
1401
1402 struct pipe_sampler_view *pview = &view->base;
1403 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1404
1405 /* Add the BO to the job so it's retained until the job is done. */
1406
1407 panfrost_batch_add_bo(batch, rsrc->bo,
1408 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1409 panfrost_bo_access_for_stage(st));
1410
1411 panfrost_batch_add_bo(batch, view->bo,
1412 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1413 panfrost_bo_access_for_stage(st));
1414
1415 return view->bo->gpu;
1416 }
1417
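/* Rebuild the sampler view's descriptor if the backing resource's BO or layout
 * changed since the view was created. */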
1418 static void
1419 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1420 struct pipe_context *pctx)
1421 {
1422 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1423 if (view->texture_bo != rsrc->bo->gpu ||
1424 view->layout != rsrc->layout) {
1425 panfrost_bo_unreference(view->bo);
1426 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1427 }
1428 }
1429
1430 void
1431 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1432 enum pipe_shader_type stage,
1433 struct mali_vertex_tiler_postfix *postfix)
1434 {
1435 struct panfrost_context *ctx = batch->ctx;
1436 struct panfrost_device *device = pan_device(ctx->base.screen);
1437
1438 if (!ctx->sampler_view_count[stage])
1439 return;
1440
1441 if (device->quirks & IS_BIFROST) {
1442 struct bifrost_texture_descriptor *descriptors;
1443
1444 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1445 ctx->sampler_view_count[stage]);
1446
1447 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1448 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1449 struct pipe_sampler_view *pview = &view->base;
1450 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1451 panfrost_update_sampler_view(view, &ctx->base);
1452
1453 /* Add the BOs to the job so they are retained until the job is done. */
1454
1455 panfrost_batch_add_bo(batch, rsrc->bo,
1456 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1457 panfrost_bo_access_for_stage(stage));
1458
1459 panfrost_batch_add_bo(batch, view->bo,
1460 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1461 panfrost_bo_access_for_stage(stage));
1462
1463 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1464 }
1465
1466 postfix->textures = panfrost_pool_upload(&batch->pool,
1467 descriptors,
1468 sizeof(struct bifrost_texture_descriptor) *
1469 ctx->sampler_view_count[stage]);
1470
1471 free(descriptors);
1472 } else {
1473 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1474
1475 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1476 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1477
1478 panfrost_update_sampler_view(view, &ctx->base);
1479
1480 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1481 }
1482
1483 postfix->textures = panfrost_pool_upload(&batch->pool,
1484 trampolines,
1485 sizeof(uint64_t) *
1486 ctx->sampler_view_count[stage]);
1487 }
1488 }
1489
1490 void
1491 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1492 enum pipe_shader_type stage,
1493 struct mali_vertex_tiler_postfix *postfix)
1494 {
1495 struct panfrost_context *ctx = batch->ctx;
1496 struct panfrost_device *device = pan_device(ctx->base.screen);
1497
1498 if (!ctx->sampler_count[stage])
1499 return;
1500
1501 if (device->quirks & IS_BIFROST) {
1502 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1503 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1504 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1505 transfer_size);
1506 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1507
1508 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1509 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1510
1511 postfix->sampler_descriptor = transfer.gpu;
1512 } else {
1513 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1514 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1515 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1516 transfer_size);
1517 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1518
1519 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1520 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1521
1522 postfix->sampler_descriptor = transfer.gpu;
1523 }
1524 }
1525
1526 void
1527 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1528 struct mali_vertex_tiler_postfix *vertex_postfix)
1529 {
1530 struct panfrost_context *ctx = batch->ctx;
1531
1532 if (!ctx->vertex)
1533 return;
1534
1535 struct panfrost_vertex_state *so = ctx->vertex;
1536
1537 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1538 vertex_postfix->attribute_meta = panfrost_pool_upload(&batch->pool, so->hw,
1539 sizeof(*so->hw) *
1540 PAN_MAX_ATTRIBUTE);
1541 }
1542
1543 void
1544 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1545 struct mali_vertex_tiler_postfix *vertex_postfix)
1546 {
1547 struct panfrost_context *ctx = batch->ctx;
1548 struct panfrost_vertex_state *so = ctx->vertex;
1549
1550 /* Staged mali_attr, and index into them. i =/= k, depending on the
1551 * vertex buffer mask and instancing. Twice as much room is allocated,
1552          * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1553 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1554 unsigned k = 0;
1555
1556 for (unsigned i = 0; i < so->num_elements; ++i) {
1557 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1558 * means duplicating some vertex buffers (who cares? aside from
1559 * maybe some caching implications but I somehow doubt that
1560 * matters) */
1561
1562 struct pipe_vertex_element *elem = &so->pipe[i];
1563 unsigned vbi = elem->vertex_buffer_index;
1564
1565 /* The exception to 1:1 mapping is that we can have multiple
1566                  * entries (NPOT divisors), so we fix up anyway */
1567
1568 so->hw[i].index = k;
1569
1570 if (!(ctx->vb_mask & (1 << vbi)))
1571 continue;
1572
1573 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1574 struct panfrost_resource *rsrc;
1575
1576 rsrc = pan_resource(buf->buffer.resource);
1577 if (!rsrc)
1578 continue;
1579
1580 /* Align to 64 bytes by masking off the lower bits. This
1581                  * will be adjusted back when we fix up the src_offset in
1582 * mali_attr_meta */
1583
1584 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1585 mali_ptr addr = raw_addr & ~63;
1586 unsigned chopped_addr = raw_addr - addr;
1587
1588 /* Add a dependency of the batch on the vertex buffer */
1589 panfrost_batch_add_bo(batch, rsrc->bo,
1590 PAN_BO_ACCESS_SHARED |
1591 PAN_BO_ACCESS_READ |
1592 PAN_BO_ACCESS_VERTEX_TILER);
1593
1594 /* Set common fields */
1595 attrs[k].elements = addr;
1596 attrs[k].stride = buf->stride;
1597
1598 /* Since we advanced the base pointer, we shrink the buffer
1599 * size */
1600 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1601
1602 /* We need to add the extra size we masked off (for
1603 * correctness) so the data doesn't get clamped away */
1604 attrs[k].size += chopped_addr;
1605
1606 /* For non-instancing make sure we initialize */
1607 attrs[k].shift = attrs[k].extra_flags = 0;
1608
1609 /* Instancing uses a dramatically different code path than
1610 * linear, so dispatch for the actual emission now that the
1611 * common code is finished */
1612
1613 unsigned divisor = elem->instance_divisor;
1614
1615 if (divisor && ctx->instance_count == 1) {
1616 /* Silly corner case where there's a divisor(=1) but
1617 * there's no legitimate instancing. So we want *every*
1618 * attribute to be the same. So set stride to zero so
1619 * we don't go anywhere. */
1620
1621 attrs[k].size = attrs[k].stride + chopped_addr;
1622 attrs[k].stride = 0;
1623 attrs[k++].elements |= MALI_ATTR_LINEAR;
1624 } else if (ctx->instance_count <= 1) {
1625 /* Normal, non-instanced attributes */
1626 attrs[k++].elements |= MALI_ATTR_LINEAR;
1627 } else {
1628 unsigned instance_shift = vertex_postfix->instance_shift;
1629 unsigned instance_odd = vertex_postfix->instance_odd;
1630
1631 k += panfrost_vertex_instanced(ctx->padded_count,
1632 instance_shift,
1633 instance_odd,
1634 divisor, &attrs[k]);
1635 }
1636 }
1637
1638 /* Add special gl_VertexID/gl_InstanceID buffers */
1639
1640 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1641 so->hw[PAN_VERTEX_ID].index = k++;
1642 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1643 so->hw[PAN_INSTANCE_ID].index = k++;
1644
1645 /* Upload whatever we emitted and go */
1646
1647 vertex_postfix->attributes = panfrost_pool_upload(&batch->pool, attrs,
1648 k * sizeof(*attrs));
1649 }
1650
1651 static mali_ptr
1652 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1653 unsigned stride, unsigned count)
1654 {
1655 /* Fill out the descriptor */
1656 slot->stride = stride;
1657 slot->size = stride * count;
1658 slot->shift = slot->extra_flags = 0;
1659
1660 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1661 slot->size);
1662
1663 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1664
1665 return transfer.gpu;
1666 }
1667
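/* Streamout addresses must be 64-byte aligned: the low bits of the true start
 * address are returned here so they can be applied as a source offset, and are
 * added back into the slot size in panfrost_emit_streamout() so no data is
 * clipped away. */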
1668 static unsigned
1669 panfrost_streamout_offset(unsigned stride, unsigned offset,
1670 struct pipe_stream_output_target *target)
1671 {
1672 return (target->buffer_offset + (offset * stride * 4)) & 63;
1673 }
1674
1675 static void
1676 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1677 unsigned stride, unsigned offset, unsigned count,
1678 struct pipe_stream_output_target *target)
1679 {
1680 /* Fill out the descriptor */
1681 slot->stride = stride * 4;
1682 slot->shift = slot->extra_flags = 0;
1683
1684 unsigned max_size = target->buffer_size;
1685 unsigned expected_size = slot->stride * count;
1686
1687 /* Grab the BO and bind it to the batch */
1688 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1689
1690 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1691 * the perspective of the TILER and FRAGMENT.
1692 */
1693 panfrost_batch_add_bo(batch, bo,
1694 PAN_BO_ACCESS_SHARED |
1695 PAN_BO_ACCESS_RW |
1696 PAN_BO_ACCESS_VERTEX_TILER |
1697 PAN_BO_ACCESS_FRAGMENT);
1698
1699 /* We will have an offset applied to get alignment */
1700 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1701 slot->elements = (addr & ~63) | MALI_ATTR_LINEAR;
1702 slot->size = MIN2(max_size, expected_size) + (addr & 63);
1703 }
1704
1705 static bool
1706 has_point_coord(unsigned mask, gl_varying_slot loc)
1707 {
1708 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1709 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1710 else if (loc == VARYING_SLOT_PNTC)
1711 return (mask & (1 << 8));
1712 else
1713 return false;
1714 }
1715
1716 /* Helpers for manipulating stream out information so we can pack varyings
1717 * accordingly. Compute the src_offset for a given captured varying */
1718
1719 static struct pipe_stream_output *
1720 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1721 {
1722 for (unsigned i = 0; i < info->num_outputs; ++i) {
1723 if (info->output[i].register_index == loc)
1724 return &info->output[i];
1725 }
1726
1727 unreachable("Varying not captured");
1728 }
1729
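/* Size in bytes of one vertex's worth of a varying in the given format, e.g. a
 * 4-channel fp32 format is 16 bytes and a 2-channel fp16 format is 4 bytes. */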
1730 static unsigned
1731 pan_varying_size(enum mali_format fmt)
1732 {
1733 unsigned type = MALI_EXTRACT_TYPE(fmt);
1734 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1735 unsigned bits = MALI_EXTRACT_BITS(fmt);
1736 unsigned bpc = 0;
1737
1738 if (bits == MALI_CHANNEL_FLOAT) {
1739 /* No doubles */
1740 bool fp16 = (type == MALI_FORMAT_SINT);
1741 assert(fp16 || (type == MALI_FORMAT_UNORM));
1742
1743 bpc = fp16 ? 2 : 4;
1744 } else {
1745 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1746
1747 /* See the enums */
1748 bits = 1 << bits;
1749 assert(bits >= 8);
1750 bpc = bits / 8;
1751 }
1752
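        /* e.g. a vec4 of 32-bit floats is 4 * 4 = 16 bytes, while an fp16
         * vec4 is 2 * 4 = 8 bytes */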
1753 return bpc * chan;
1754 }
1755
1756 /* Indices for named (non-XFB) varyings that are present. These are packed
1757 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1758 * PAN_VARY_*). This has the nice property that you can look up the buffer index
1759 * of a given special field given a shift S by:
1760 *
1761 * idx = popcount(P & ((1 << S) - 1))
1762 *
1763 * That is, count the varyings present before this one; that count is the
1764 * buffer index. Likewise, the total number of special
1765 * buffers required is simply popcount(P)
1766 */
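/* For example, with P = GENERAL | POSITION | PSIZ = 0b111, the PSIZ buffer
 * (S = 2) lands at index popcount(0b111 & 0b011) = 2, and popcount(0b111) = 3
 * buffers are needed in total. */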
1767
1768 enum pan_special_varying {
1769 PAN_VARY_GENERAL = 0,
1770 PAN_VARY_POSITION = 1,
1771 PAN_VARY_PSIZ = 2,
1772 PAN_VARY_PNTCOORD = 3,
1773 PAN_VARY_FACE = 4,
1774 PAN_VARY_FRAGCOORD = 5,
1775
1776 /* Keep last */
1777 PAN_VARY_MAX,
1778 };
1779
1780 /* Given a varying, figure out which buffer index it corresponds to */
1781
1782 static inline unsigned
1783 pan_varying_index(unsigned present, enum pan_special_varying v)
1784 {
1785 unsigned mask = (1 << v) - 1;
1786 return util_bitcount(present & mask);
1787 }
1788
1789 /* Get the base offset for XFB buffers, which by convention come after
1790 * everything else. Wrapper function for semantic reasons; by construction this
1791 * is just popcount. */
1792
1793 static inline unsigned
1794 pan_xfb_base(unsigned present)
1795 {
1796 return util_bitcount(present);
1797 }
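/* Taken together: the varying buffer array holds the popcount(present)
 * special/general buffers in ascending PAN_VARY_* order, followed by one
 * buffer per transform feedback target. */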
1798
1799 /* Computes the present mask for varyings so we can start emitting varying records */
1800
1801 static inline unsigned
1802 pan_varying_present(
1803 struct panfrost_shader_state *vs,
1804 struct panfrost_shader_state *fs,
1805 unsigned quirks)
1806 {
1807 /* At the moment we always emit general and position buffers. Not
1808 * strictly necessary but usually harmless */
1809
1810 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1811
1812 /* Enable special buffers by the shader info */
1813
1814 if (vs->writes_point_size)
1815 present |= (1 << PAN_VARY_PSIZ);
1816
1817 if (fs->reads_point_coord)
1818 present |= (1 << PAN_VARY_PNTCOORD);
1819
1820 if (fs->reads_face)
1821 present |= (1 << PAN_VARY_FACE);
1822
1823 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1824 present |= (1 << PAN_VARY_FRAGCOORD);
1825
1826 /* Also, if we have a point sprite, we need a point coord buffer */
1827
1828 for (unsigned i = 0; i < fs->varying_count; i++) {
1829 gl_varying_slot loc = fs->varyings_loc[i];
1830
1831 if (has_point_coord(fs->point_sprite_mask, loc))
1832 present |= (1 << PAN_VARY_PNTCOORD);
1833 }
1834
1835 return present;
1836 }
1837
1838 /* Emitters for varying records */
1839
1840 static struct mali_attr_meta
1841 pan_emit_vary(unsigned present, enum pan_special_varying buf,
1842 unsigned quirks, enum mali_format format,
1843 unsigned offset)
1844 {
1845 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1846
1847 struct mali_attr_meta meta = {
1848 .index = pan_varying_index(present, buf),
1849 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1850 .swizzle = quirks & HAS_SWIZZLES ?
1851 panfrost_get_default_swizzle(nr_channels) :
1852 panfrost_bifrost_swizzle(nr_channels),
1853 .format = format,
1854 .src_offset = offset
1855 };
1856
1857 return meta;
1858 }
1859
1860 /* General varying that is not consumed by the other stage, so its value is discarded */
1861
1862 static struct mali_attr_meta
1863 pan_emit_vary_only(unsigned present, unsigned quirks)
1864 {
1865 return pan_emit_vary(present, 0, quirks, MALI_VARYING_DISCARD, 0);
1866 }
1867
1868 /* Special records */
1869
1870 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1871 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1872 [PAN_VARY_PSIZ] = MALI_R16F,
1873 [PAN_VARY_PNTCOORD] = MALI_R16F,
1874 [PAN_VARY_FACE] = MALI_R32I,
1875 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1876 };
1877
1878 static struct mali_attr_meta
1879 pan_emit_vary_special(unsigned present, enum pan_special_varying buf,
1880 unsigned quirks)
1881 {
1882 assert(buf < PAN_VARY_MAX);
1883 return pan_emit_vary(present, buf, quirks, pan_varying_formats[buf], 0);
1884 }
1885
1886 static enum mali_format
1887 pan_xfb_format(enum mali_format format, unsigned nr)
1888 {
1889 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1890 return MALI_R32F | MALI_NR_CHANNELS(nr);
1891 else
1892 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1893 }
1894
1895 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1896 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1897 * value. */
1898
1899 static struct mali_attr_meta
1900 pan_emit_vary_xfb(unsigned present,
1901 unsigned max_xfb,
1902 unsigned *streamout_offsets,
1903 unsigned quirks,
1904 enum mali_format format,
1905 struct pipe_stream_output o)
1906 {
1907 /* Construct a transform feedback record for the captured output */
1908 struct mali_attr_meta meta = {
1909 /* XFB buffers come after everything else */
1910 .index = pan_xfb_base(present) + o.output_buffer,
1911
1912 /* As usual unknown bit */
1913 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1914
1915 /* Override swizzle with number of channels */
1916 .swizzle = quirks & HAS_SWIZZLES ?
1917 panfrost_get_default_swizzle(o.num_components) :
1918 panfrost_bifrost_swizzle(o.num_components),
1919
1920 /* Override number of channels and precision to highp */
1921 .format = pan_xfb_format(format, o.num_components),
1922
1923 /* Apply given offsets together */
1924 .src_offset = (o.dst_offset * 4) /* dwords */
1925 + streamout_offsets[o.output_buffer]
1926 };
1927
1928 return meta;
1929 }
1930
1931 /* Determine if we should capture a varying for XFB. This requires actually
1932 * having a buffer for it. If we don't capture it, we'll fall back to a general
1933 * varying path (linked or unlinked, possibly discarding the write) */
1934
1935 static bool
1936 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1937 unsigned loc, unsigned max_xfb)
1938 {
1939 if (!(xfb->so_mask & (1ll << loc)))
1940 return false;
1941
1942 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1943 return o->output_buffer < max_xfb;
1944 }
1945
1946 /* Higher-level wrapper around all of the above, classifying a varying into one
1947 * of the above types */
1948
1949 static struct mali_attr_meta
1950 panfrost_emit_varying(
1951 struct panfrost_shader_state *stage,
1952 struct panfrost_shader_state *other,
1953 struct panfrost_shader_state *xfb,
1954 unsigned present,
1955 unsigned max_xfb,
1956 unsigned *streamout_offsets,
1957 unsigned quirks,
1958 unsigned *gen_offsets,
1959 enum mali_format *gen_formats,
1960 unsigned *gen_stride,
1961 unsigned idx,
1962 bool should_alloc,
1963 bool is_fragment)
1964 {
1965 gl_varying_slot loc = stage->varyings_loc[idx];
1966 enum mali_format format = stage->varyings[idx];
1967
1968 /* Override format to match linkage */
1969 if (!should_alloc && gen_formats[idx])
1970 format = gen_formats[idx];
1971
1972 if (has_point_coord(stage->point_sprite_mask, loc)) {
1973 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1974 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1975 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1976 return pan_emit_vary_xfb(present, max_xfb, streamout_offsets, quirks, format, *o);
1977 } else if (loc == VARYING_SLOT_POS) {
1978 if (is_fragment)
1979 return pan_emit_vary_special(present, PAN_VARY_FRAGCOORD, quirks);
1980 else
1981 return pan_emit_vary_special(present, PAN_VARY_POSITION, quirks);
1982 } else if (loc == VARYING_SLOT_PSIZ) {
1983 return pan_emit_vary_special(present, PAN_VARY_PSIZ, quirks);
1984 } else if (loc == VARYING_SLOT_PNTC) {
1985 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1986 } else if (loc == VARYING_SLOT_FACE) {
1987 return pan_emit_vary_special(present, PAN_VARY_FACE, quirks);
1988 }
1989
1990 /* Not a special case, so this is a general varying. Check if we're linked with the other stage */
1991 signed other_idx = -1;
1992
1993 for (unsigned j = 0; j < other->varying_count; ++j) {
1994 if (other->varyings_loc[j] == loc) {
1995 other_idx = j;
1996 break;
1997 }
1998 }
1999
2000 if (other_idx < 0)
2001 return pan_emit_vary_only(present, quirks);
2002
2003 unsigned offset = gen_offsets[other_idx];
2004
2005 if (should_alloc) {
2006 /* We're linked, so allocate space via a watermark allocation */
2007 enum mali_format alt = other->varyings[other_idx];
2008
2009 /* Do interpolation at minimum precision */
2010 unsigned size_main = pan_varying_size(format);
2011 unsigned size_alt = pan_varying_size(alt);
2012 unsigned size = MIN2(size_main, size_alt);
2013
2014 /* If a varying is marked for XFB but not actually captured, we
2015 * should match the format to the format that would otherwise
2016 * be used for XFB, since dEQP checks for invariance here. It's
2017 * unclear if this is required by the spec. */
2018
2019 if (xfb->so_mask & (1ull << loc)) {
2020 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
2021 format = pan_xfb_format(format, o->num_components);
2022 size = pan_varying_size(format);
2023 } else if (size == size_alt) {
2024 format = alt;
2025 }
2026
2027 gen_offsets[idx] = *gen_stride;
2028 gen_formats[other_idx] = format;
2029 offset = *gen_stride;
2030 *gen_stride += size;
2031 }
2032
2033 return pan_emit_vary(present, PAN_VARY_GENERAL,
2034 quirks, format, offset);
2035 }
2036
2037 static void
2038 pan_emit_special_input(union mali_attr *varyings,
2039 unsigned present,
2040 enum pan_special_varying v,
2041 mali_ptr addr)
2042 {
2043 if (present & (1 << v)) {
2044 /* Ensure we write exactly once for performance and with fields
2045 * zeroed appropriately to avoid flakes */
2046
2047 union mali_attr s = {
2048 .elements = addr
2049 };
2050
2051 varyings[pan_varying_index(present, v)] = s;
2052 }
2053 }
2054
2055 void
2056 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2057 unsigned vertex_count,
2058 struct mali_vertex_tiler_postfix *vertex_postfix,
2059 struct mali_vertex_tiler_postfix *tiler_postfix,
2060 union midgard_primitive_size *primitive_size)
2061 {
2062 /* Load the shaders */
2063 struct panfrost_context *ctx = batch->ctx;
2064 struct panfrost_device *dev = pan_device(ctx->base.screen);
2065 struct panfrost_shader_state *vs, *fs;
2066 size_t vs_size, fs_size;
2067
2068 /* Allocate the varying descriptor */
2069
2070 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2071 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
2072 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
2073 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
2074
2075 struct panfrost_transfer trans = panfrost_pool_alloc(&batch->pool,
2076 vs_size +
2077 fs_size);
2078
2079 struct pipe_stream_output_info *so = &vs->stream_output;
2080 unsigned present = pan_varying_present(vs, fs, dev->quirks);
2081
2082 /* Check if this varying is linked by us. This is the case for
2083 * general-purpose, non-captured varyings. If it is, link it. If it's
2084 * not, use the provided stream out information to determine the
2085 * offset, since it was already linked for us. */
2086
2087 unsigned gen_offsets[32];
2088 enum mali_format gen_formats[32];
2089 memset(gen_offsets, 0, sizeof(gen_offsets));
2090 memset(gen_formats, 0, sizeof(gen_formats));
2091
2092 unsigned gen_stride = 0;
2093 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
2094 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
2095
2096 unsigned streamout_offsets[32];
2097
2098 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2099 streamout_offsets[i] = panfrost_streamout_offset(
2100 so->stride[i],
2101 ctx->streamout.offsets[i],
2102 ctx->streamout.targets[i]);
2103 }
2104
2105 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
2106 struct mali_attr_meta *ofs = ovs + vs->varying_count;
2107
2108 for (unsigned i = 0; i < vs->varying_count; i++) {
2109 ovs[i] = panfrost_emit_varying(vs, fs, vs, present,
2110 ctx->streamout.num_targets, streamout_offsets,
2111 dev->quirks,
2112 gen_offsets, gen_formats, &gen_stride, i, true, false);
2113 }
2114
2115 for (unsigned i = 0; i < fs->varying_count; i++) {
2116 ofs[i] = panfrost_emit_varying(fs, vs, vs, present,
2117 ctx->streamout.num_targets, streamout_offsets,
2118 dev->quirks,
2119 gen_offsets, gen_formats, &gen_stride, i, false, true);
2120 }
2121
2122 unsigned xfb_base = pan_xfb_base(present);
2123 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
2124 sizeof(union mali_attr) * (xfb_base + ctx->streamout.num_targets));
2125 union mali_attr *varyings = (union mali_attr *) T.cpu;
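
        /* Buffer order within this array: indices [0, xfb_base) are the
         * special/general varying buffers in present-bit order, and indices
         * [xfb_base, xfb_base + num_targets) are the streamout targets,
         * matching the .index fields emitted for the records above. */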
2126
2127 /* Emit the stream out buffers */
2128
2129 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2130 ctx->vertex_count);
2131
2132 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2133 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2134 so->stride[i],
2135 ctx->streamout.offsets[i],
2136 out_count,
2137 ctx->streamout.targets[i]);
2138 }
2139
2140 panfrost_emit_varyings(batch,
2141 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2142 gen_stride, vertex_count);
2143
2144 /* fp32 vec4 gl_Position */
2145 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2146 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2147 sizeof(float) * 4, vertex_count);
2148
2149 if (present & (1 << PAN_VARY_PSIZ)) {
2150 primitive_size->pointer = panfrost_emit_varyings(batch,
2151 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2152 2, vertex_count);
2153 }
2154
2155 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_VARYING_POINT_COORD);
2156 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_VARYING_FRONT_FACING);
2157 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_VARYING_FRAG_COORD);
2158
2159 vertex_postfix->varyings = T.gpu;
2160 tiler_postfix->varyings = T.gpu;
2161
2162 vertex_postfix->varying_meta = trans.gpu;
2163 tiler_postfix->varying_meta = trans.gpu + vs_size;
2164 }
2165
2166 void
2167 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2168 struct mali_vertex_tiler_prefix *vertex_prefix,
2169 struct mali_vertex_tiler_postfix *vertex_postfix,
2170 struct mali_vertex_tiler_prefix *tiler_prefix,
2171 struct mali_vertex_tiler_postfix *tiler_postfix,
2172 union midgard_primitive_size *primitive_size)
2173 {
2174 struct panfrost_context *ctx = batch->ctx;
2175 struct panfrost_device *device = pan_device(ctx->base.screen);
2176 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2177 struct bifrost_payload_vertex bifrost_vertex = {0,};
2178 struct bifrost_payload_tiler bifrost_tiler = {0,};
2179 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2180 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2181 void *vp, *tp;
2182 size_t vp_size, tp_size;
2183
2184 if (device->quirks & IS_BIFROST) {
2185 bifrost_vertex.prefix = *vertex_prefix;
2186 bifrost_vertex.postfix = *vertex_postfix;
2187 vp = &bifrost_vertex;
2188 vp_size = sizeof(bifrost_vertex);
2189
2190 bifrost_tiler.prefix = *tiler_prefix;
2191 bifrost_tiler.tiler.primitive_size = *primitive_size;
2192 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2193 bifrost_tiler.postfix = *tiler_postfix;
2194 tp = &bifrost_tiler;
2195 tp_size = sizeof(bifrost_tiler);
2196 } else {
2197 midgard_vertex.prefix = *vertex_prefix;
2198 midgard_vertex.postfix = *vertex_postfix;
2199 vp = &midgard_vertex;
2200 vp_size = sizeof(midgard_vertex);
2201
2202 midgard_tiler.prefix = *tiler_prefix;
2203 midgard_tiler.postfix = *tiler_postfix;
2204 midgard_tiler.primitive_size = *primitive_size;
2205 tp = &midgard_tiler;
2206 tp_size = sizeof(midgard_tiler);
2207 }
2208
2209 if (wallpapering) {
2210 /* Inject in reverse order, with "predicted" job indices.
2211 * THIS IS A HACK XXX */
2212 panfrost_new_job(&batch->pool, &batch->scoreboard, JOB_TYPE_TILER, false,
2213 batch->scoreboard.job_index + 2, tp, tp_size, true);
2214 panfrost_new_job(&batch->pool, &batch->scoreboard, JOB_TYPE_VERTEX, false, 0,
2215 vp, vp_size, true);
2216 return;
2217 }
2218
2219 /* If rasterizer discard is enabled, only submit the vertex job */
2220
2221 bool rasterizer_discard = ctx->rasterizer &&
2222 ctx->rasterizer->base.rasterizer_discard;
2223
2224 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, JOB_TYPE_VERTEX, false, 0,
2225 vp, vp_size, false);
2226
2227 if (rasterizer_discard)
2228 return;
2229
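        /* Otherwise submit the tiler job chained after the vertex job it
         * consumes, using the vertex job's index as its dependency. */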
2230 panfrost_new_job(&batch->pool, &batch->scoreboard, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2231 false);
2232 }
2233
2234 /* TODO: stop hardcoding this */
2235 mali_ptr
2236 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2237 {
2238 uint16_t locations[] = {
2239 128, 128,
2240 0, 256,
2241 0, 256,
2242 0, 256,
2243 0, 256,
2244 0, 256,
2245 0, 256,
2246 0, 256,
2247 0, 256,
2248 0, 256,
2249 0, 256,
2250 0, 256,
2251 0, 256,
2252 0, 256,
2253 0, 256,
2254 0, 256,
2255 0, 256,
2256 0, 256,
2257 0, 256,
2258 0, 256,
2259 0, 256,
2260 0, 256,
2261 0, 256,
2262 0, 256,
2263 0, 256,
2264 0, 256,
2265 0, 256,
2266 0, 256,
2267 0, 256,
2268 0, 256,
2269 0, 256,
2270 0, 256,
2271 128, 128,
2272 0, 0,
2273 0, 0,
2274 0, 0,
2275 0, 0,
2276 0, 0,
2277 0, 0,
2278 0, 0,
2279 0, 0,
2280 0, 0,
2281 0, 0,
2282 0, 0,
2283 0, 0,
2284 0, 0,
2285 0, 0,
2286 0, 0,
2287 };
2288
2289 return panfrost_pool_upload(&batch->pool, locations, sizeof(locations));
2290 }