panfrost: Handle per-sample shading
[mesa.git] / src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_pool_upload(&batch->pool, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77         /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_pool_alloc(&batch->pool, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query) {
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
140 PAN_BO_ACCESS_SHARED |
141 PAN_BO_ACCESS_RW |
142 PAN_BO_ACCESS_FRAGMENT);
143 } else {
144 postfix->occlusion_counter = 0;
145 }
146 }
147
148 void
149 panfrost_vt_init(struct panfrost_context *ctx,
150 enum pipe_shader_type stage,
151 struct mali_vertex_tiler_prefix *prefix,
152 struct mali_vertex_tiler_postfix *postfix)
153 {
154 struct panfrost_device *device = pan_device(ctx->base.screen);
155
156 if (!ctx->shader[stage])
157 return;
158
159 memset(prefix, 0, sizeof(*prefix));
160 memset(postfix, 0, sizeof(*postfix));
161
162 if (device->quirks & IS_BIFROST) {
163 postfix->gl_enables = 0x2;
164 panfrost_vt_emit_shared_memory(ctx, postfix);
165 } else {
166 postfix->gl_enables = 0x6;
167 panfrost_vt_attach_framebuffer(ctx, postfix);
168 }
169
170 if (stage == PIPE_SHADER_FRAGMENT) {
171 panfrost_vt_update_occlusion_query(ctx, postfix);
172 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
173 }
174 }
175
176 static unsigned
177 panfrost_translate_index_size(unsigned size)
178 {
179 switch (size) {
180 case 1:
181 return MALI_DRAW_INDEXED_UINT8;
182
183 case 2:
184 return MALI_DRAW_INDEXED_UINT16;
185
186 case 4:
187 return MALI_DRAW_INDEXED_UINT32;
188
189 default:
190 unreachable("Invalid index size");
191 }
192 }
193
194 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
195 * good for the duration of the draw (transient), could last longer. Also get
196 * the bounds on the index buffer for the range accessed by the draw. We do
197 * these operations together because there are natural optimizations which
198 * require them to be together. */
199
200 static mali_ptr
201 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
202 const struct pipe_draw_info *info,
203 unsigned *min_index, unsigned *max_index)
204 {
205 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
206 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
207 off_t offset = info->start * info->index_size;
208 bool needs_indices = true;
209 mali_ptr out = 0;
210
211 if (info->max_index != ~0u) {
212 *min_index = info->min_index;
213 *max_index = info->max_index;
214 needs_indices = false;
215 }
216
217 if (!info->has_user_indices) {
218 /* Only resources can be directly mapped */
219 panfrost_batch_add_bo(batch, rsrc->bo,
220 PAN_BO_ACCESS_SHARED |
221 PAN_BO_ACCESS_READ |
222 PAN_BO_ACCESS_VERTEX_TILER);
223 out = rsrc->bo->gpu + offset;
224
225 /* Check the cache */
226 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
227 info->start,
228 info->count,
229 min_index,
230 max_index);
231 } else {
232 /* Otherwise, we need to upload to transient memory */
233 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
234 out = panfrost_pool_upload(&batch->pool, ibuf8 + offset,
235 info->count *
236 info->index_size);
237 }
238
239 if (needs_indices) {
240 /* Fallback */
241 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
242
243 if (!info->has_user_indices)
244 panfrost_minmax_cache_add(rsrc->index_cache,
245 info->start, info->count,
246 *min_index, *max_index);
247 }
248
249 return out;
250 }
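
/* Illustrative example (values invented for clarity, not from a real trace):
 * for an indexed draw where the state tracker does not supply bounds
 * (info->max_index == ~0u), with info->start = 0, info->count = 3 and a
 * 16-bit index buffer containing { 5, 7, 6 }, this returns *min_index = 5
 * and *max_index = 7. Resource-backed index buffers cache that result in
 * rsrc->index_cache, so repeating the draw over the same range skips the CPU
 * scan; user index buffers always take the u_vbuf_get_minmax_index fallback
 * and are never cached. */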
251
252 void
253 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
254 const struct pipe_draw_info *info,
255 enum mali_draw_mode draw_mode,
256 struct mali_vertex_tiler_postfix *vertex_postfix,
257 struct mali_vertex_tiler_prefix *tiler_prefix,
258 struct mali_vertex_tiler_postfix *tiler_postfix,
259 unsigned *vertex_count,
260 unsigned *padded_count)
261 {
262 tiler_prefix->draw_mode = draw_mode;
263
264 unsigned draw_flags = 0;
265
266 if (panfrost_writes_point_size(ctx))
267 draw_flags |= MALI_DRAW_VARYING_SIZE;
268
269 if (info->primitive_restart)
270 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
271
272         /* These don't make much sense */
273
274 draw_flags |= 0x3000;
275
276 if (info->index_size) {
277 unsigned min_index = 0, max_index = 0;
278
279 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
280 info,
281 &min_index,
282 &max_index);
283
284 /* Use the corresponding values */
285 *vertex_count = max_index - min_index + 1;
286 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
287 tiler_prefix->offset_bias_correction = -min_index;
288 tiler_prefix->index_count = MALI_POSITIVE(info->count);
289 draw_flags |= panfrost_translate_index_size(info->index_size);
290 } else {
291 tiler_prefix->indices = 0;
292 *vertex_count = ctx->vertex_count;
293 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
294 tiler_prefix->offset_bias_correction = 0;
295 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
296 }
297
298 tiler_prefix->unknown_draw = draw_flags;
299
300 /* Encode the padded vertex count */
301
302 if (info->instance_count > 1) {
303 *padded_count = panfrost_padded_vertex_count(*vertex_count);
304
305 unsigned shift = __builtin_ctz(ctx->padded_count);
306 unsigned k = ctx->padded_count >> (shift + 1);
307
308 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
309 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
310 } else {
311 *padded_count = *vertex_count;
312
313 /* Reset instancing state */
314 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
315 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
316 }
317 }
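
/* Rough worked example of the fields set above (illustrative values):
 * with the { 5, 7, 6 } index buffer from before and info->index_bias = 100,
 * we get *vertex_count = 7 - 5 + 1 = 3, offset_start = 5 + 100 = 105 and
 * offset_bias_correction = -5. For instancing, the padded count is
 * decomposed as padded = (2 * odd + 1) << shift; e.g. a padded count of 12
 * gives shift = ctz(12) = 2 and odd = 12 >> 3 = 1, since (2 * 1 + 1) << 2
 * = 12. */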
318
319 static void
320 panfrost_shader_meta_init(struct panfrost_context *ctx,
321 enum pipe_shader_type st,
322 struct mali_shader_meta *meta)
323 {
324 const struct panfrost_device *dev = pan_device(ctx->base.screen);
325 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
326
327 memset(meta, 0, sizeof(*meta));
328 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
329 meta->attribute_count = ss->attribute_count;
330 meta->varying_count = ss->varying_count;
331 meta->texture_count = ctx->sampler_view_count[st];
332 meta->sampler_count = ctx->sampler_count[st];
333
334 if (dev->quirks & IS_BIFROST) {
335 if (st == PIPE_SHADER_VERTEX)
336 meta->bifrost1.unk1 = 0x800000;
337 else {
338 /* First clause ATEST |= 0x4000000.
339 * Less than 32 regs |= 0x200 */
340 meta->bifrost1.unk1 = 0x950020;
341 }
342
343 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
344 if (st == PIPE_SHADER_VERTEX)
345 meta->bifrost2.preload_regs = 0xC0;
346 else {
347 meta->bifrost2.preload_regs = 0x1;
348 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
349 }
350
351 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
352 ss->uniform_cutoff);
353 } else {
354 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
355 ss->uniform_cutoff);
356 meta->midgard1.work_count = ss->work_reg_count;
357
358 /* TODO: This is not conformant on ES3 */
359 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
360
361 meta->midgard1.flags_lo = 0x20;
362 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
363
364 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
365 }
366 }
367
368 static unsigned
369 panfrost_translate_compare_func(enum pipe_compare_func in)
370 {
371 switch (in) {
372 case PIPE_FUNC_NEVER:
373 return MALI_FUNC_NEVER;
374
375 case PIPE_FUNC_LESS:
376 return MALI_FUNC_LESS;
377
378 case PIPE_FUNC_EQUAL:
379 return MALI_FUNC_EQUAL;
380
381 case PIPE_FUNC_LEQUAL:
382 return MALI_FUNC_LEQUAL;
383
384 case PIPE_FUNC_GREATER:
385 return MALI_FUNC_GREATER;
386
387 case PIPE_FUNC_NOTEQUAL:
388 return MALI_FUNC_NOTEQUAL;
389
390 case PIPE_FUNC_GEQUAL:
391 return MALI_FUNC_GEQUAL;
392
393 case PIPE_FUNC_ALWAYS:
394 return MALI_FUNC_ALWAYS;
395
396 default:
397 unreachable("Invalid func");
398 }
399 }
400
401 static unsigned
402 panfrost_translate_stencil_op(enum pipe_stencil_op in)
403 {
404 switch (in) {
405 case PIPE_STENCIL_OP_KEEP:
406 return MALI_STENCIL_KEEP;
407
408 case PIPE_STENCIL_OP_ZERO:
409 return MALI_STENCIL_ZERO;
410
411 case PIPE_STENCIL_OP_REPLACE:
412 return MALI_STENCIL_REPLACE;
413
414 case PIPE_STENCIL_OP_INCR:
415 return MALI_STENCIL_INCR;
416
417 case PIPE_STENCIL_OP_DECR:
418 return MALI_STENCIL_DECR;
419
420 case PIPE_STENCIL_OP_INCR_WRAP:
421 return MALI_STENCIL_INCR_WRAP;
422
423 case PIPE_STENCIL_OP_DECR_WRAP:
424 return MALI_STENCIL_DECR_WRAP;
425
426 case PIPE_STENCIL_OP_INVERT:
427 return MALI_STENCIL_INVERT;
428
429 default:
430 unreachable("Invalid stencil op");
431 }
432 }
433
434 static unsigned
435 translate_tex_wrap(enum pipe_tex_wrap w)
436 {
437 switch (w) {
438 case PIPE_TEX_WRAP_REPEAT:
439 return MALI_WRAP_REPEAT;
440
441 case PIPE_TEX_WRAP_CLAMP:
442 return MALI_WRAP_CLAMP;
443
444 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
445 return MALI_WRAP_CLAMP_TO_EDGE;
446
447 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
448 return MALI_WRAP_CLAMP_TO_BORDER;
449
450 case PIPE_TEX_WRAP_MIRROR_REPEAT:
451 return MALI_WRAP_MIRRORED_REPEAT;
452
453 case PIPE_TEX_WRAP_MIRROR_CLAMP:
454 return MALI_WRAP_MIRRORED_CLAMP;
455
456 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
457 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
458
459 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
460 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
461
462 default:
463 unreachable("Invalid wrap");
464 }
465 }
466
467 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
468 struct mali_sampler_descriptor *hw)
469 {
470 unsigned func = panfrost_translate_compare_func(cso->compare_func);
471 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
472 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
473 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
474 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
475 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
476 unsigned mip_filter = mip_linear ?
477 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
478 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
479
480 *hw = (struct mali_sampler_descriptor) {
481 .filter_mode = min_filter | mag_filter | mip_filter |
482 normalized,
483 .wrap_s = translate_tex_wrap(cso->wrap_s),
484 .wrap_t = translate_tex_wrap(cso->wrap_t),
485 .wrap_r = translate_tex_wrap(cso->wrap_r),
486 .compare_func = panfrost_flip_compare_func(func),
487 .border_color = {
488 cso->border_color.f[0],
489 cso->border_color.f[1],
490 cso->border_color.f[2],
491 cso->border_color.f[3]
492 },
493 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
494 .max_lod = FIXED_16(cso->max_lod, false),
495 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
496 .seamless_cube_map = cso->seamless_cube_map,
497 };
498
499 /* If necessary, we disable mipmapping in the sampler descriptor by
500 * clamping the LOD as tight as possible (from 0 to epsilon,
501 * essentially -- remember these are fixed point numbers, so
502 * epsilon=1/256) */
503
504 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
505 hw->max_lod = hw->min_lod + 1;
506 }
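
/* Worked example of the LOD clamp above, assuming FIXED_16 packs the LOD in
 * unsigned 8.8 fixed point (consistent with the epsilon = 1/256 note): a
 * sampler with min_lod = 0.0 and mipmapping disabled ends up with
 * min_lod = 0x0000 and max_lod = 0x0001, i.e. a [0, 1/256] range that pins
 * sampling to the base level. */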
507
508 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
509 struct bifrost_sampler_descriptor *hw)
510 {
511 *hw = (struct bifrost_sampler_descriptor) {
512 .unk1 = 0x1,
513 .wrap_s = translate_tex_wrap(cso->wrap_s),
514 .wrap_t = translate_tex_wrap(cso->wrap_t),
515 .wrap_r = translate_tex_wrap(cso->wrap_r),
516 .unk8 = 0x8,
517 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
518 .norm_coords = cso->normalized_coords,
519 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
520 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
521 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
522 .max_lod = FIXED_16(cso->max_lod, false),
523 };
524
525 /* If necessary, we disable mipmapping in the sampler descriptor by
526 * clamping the LOD as tight as possible (from 0 to epsilon,
527 * essentially -- remember these are fixed point numbers, so
528 * epsilon=1/256) */
529
530 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
531 hw->max_lod = hw->min_lod + 1;
532 }
533
534 static void
535 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
536 struct mali_stencil_test *out)
537 {
538 out->ref = 0; /* Gallium gets it from elsewhere */
539
540 out->mask = in->valuemask;
541 out->func = panfrost_translate_compare_func(in->func);
542 out->sfail = panfrost_translate_stencil_op(in->fail_op);
543 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
544 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
545 }
546
547 static void
548 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
549 struct mali_shader_meta *fragmeta)
550 {
551 if (!ctx->rasterizer) {
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
553 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
554 fragmeta->depth_units = 0.0f;
555 fragmeta->depth_factor = 0.0f;
556 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
557 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
558 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, true);
559 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, true);
560 return;
561 }
562
563 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
564
565 bool msaa = rast->multisample;
566
567 /* TODO: Sample size */
568 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
569 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
570
571 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE,
572 msaa && ctx->min_samples > 1);
573
574 fragmeta->depth_units = rast->offset_units * 2.0f;
575 fragmeta->depth_factor = rast->offset_scale;
576
577         /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
578
579 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
580 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
581
582 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
583 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
584 }
585
586 static void
587 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
588 struct mali_shader_meta *fragmeta)
589 {
590 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
591 int zfunc = PIPE_FUNC_ALWAYS;
592
593 if (!zsa) {
594 struct pipe_stencil_state default_stencil = {
595 .enabled = 0,
596 .func = PIPE_FUNC_ALWAYS,
597 .fail_op = MALI_STENCIL_KEEP,
598 .zfail_op = MALI_STENCIL_KEEP,
599 .zpass_op = MALI_STENCIL_KEEP,
600 .writemask = 0xFF,
601 .valuemask = 0xFF
602 };
603
604 panfrost_make_stencil_state(&default_stencil,
605 &fragmeta->stencil_front);
606 fragmeta->stencil_mask_front = default_stencil.writemask;
607 fragmeta->stencil_back = fragmeta->stencil_front;
608 fragmeta->stencil_mask_back = default_stencil.writemask;
609 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
610 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
611 } else {
612 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
613 zsa->stencil[0].enabled);
614 panfrost_make_stencil_state(&zsa->stencil[0],
615 &fragmeta->stencil_front);
616 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
617 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
618
619 /* If back-stencil is not enabled, use the front values */
620
621 if (zsa->stencil[1].enabled) {
622 panfrost_make_stencil_state(&zsa->stencil[1],
623 &fragmeta->stencil_back);
624 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
625 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
626 } else {
627 fragmeta->stencil_back = fragmeta->stencil_front;
628 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
629 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
630 }
631
632 if (zsa->depth.enabled)
633 zfunc = zsa->depth.func;
634
635 /* Depth state (TODO: Refactor) */
636
637 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
638 zsa->depth.writemask);
639 }
640
641 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
642 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
643 }
644
645 static bool
646 panfrost_fs_required(
647 struct panfrost_shader_state *fs,
648 struct panfrost_blend_final *blend,
649 unsigned rt_count)
650 {
651 /* If we generally have side effects */
652 if (fs->fs_sidefx)
653 return true;
654
655 /* If colour is written we need to execute */
656 for (unsigned i = 0; i < rt_count; ++i) {
657 if (!blend[i].no_colour)
658 return true;
659 }
660
661 /* If depth is written and not implied we need to execute.
662 * TODO: Predicate on Z/S writes being enabled */
663 return (fs->writes_depth || fs->writes_stencil);
664 }
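
/* Example of when this lets the fragment shader be skipped entirely: a
 * depth-only pass (e.g. shadow mapping) where every render target has
 * no_colour set, the shader has no side effects and does not write Z/S.
 * panfrost_fs_required() then returns false, and on Midgard hardware with
 * MIDGARD_SHADERLESS the setup in panfrost_frag_meta_blend_update() below
 * nulls out the shader pointer and most of the associated counts. */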
665
666 static void
667 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
668 struct mali_shader_meta *fragmeta,
669 void *rts)
670 {
671 const struct panfrost_device *dev = pan_device(ctx->base.screen);
672 struct panfrost_shader_state *fs;
673 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
674
675 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
676 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
677 !ctx->blend->base.dither);
678
679 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
680 ctx->blend->base.alpha_to_coverage);
681
682 /* Get blending setup */
683 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
684
685 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
686 unsigned shader_offset = 0;
687 struct panfrost_bo *shader_bo = NULL;
688
689 for (unsigned c = 0; c < rt_count; ++c)
690 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
691 &shader_offset);
692
693 /* Disable shader execution if we can */
694 if (dev->quirks & MIDGARD_SHADERLESS
695 && !panfrost_fs_required(fs, blend, rt_count)) {
696 fragmeta->shader = 0;
697 fragmeta->attribute_count = 0;
698 fragmeta->varying_count = 0;
699 fragmeta->texture_count = 0;
700 fragmeta->sampler_count = 0;
701
702 /* This feature is not known to work on Bifrost */
703 fragmeta->midgard1.work_count = 1;
704 fragmeta->midgard1.uniform_count = 0;
705 fragmeta->midgard1.uniform_buffer_count = 0;
706 }
707
708 /* If there is a blend shader, work registers are shared. We impose 8
709 * work registers as a limit for blend shaders. Should be lower XXX */
710
711 if (!(dev->quirks & IS_BIFROST)) {
712 for (unsigned c = 0; c < rt_count; ++c) {
713 if (blend[c].is_shader) {
714 fragmeta->midgard1.work_count =
715 MAX2(fragmeta->midgard1.work_count, 8);
716 }
717 }
718 }
719
720 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
721 * copied to the blend_meta appended (by convention), but this is the
722 * field actually read by the hardware. (Or maybe both are read...?).
723 * Specify the last RTi with a blend shader. */
724
725 fragmeta->blend.shader = 0;
726
727 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
728 if (!blend[rt].is_shader)
729 continue;
730
731 fragmeta->blend.shader = blend[rt].shader.gpu |
732 blend[rt].shader.first_tag;
733 break;
734 }
735
736 if (dev->quirks & MIDGARD_SFBD) {
737                 /* On platforms with only a single render target, the blend
738                  * information is inside the shader meta itself. We additionally
739 * need to signal CAN_DISCARD for nontrivial blend modes (so
740 * we're able to read back the destination buffer) */
741
742 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
743 blend[0].is_shader);
744
745 if (!blend[0].is_shader) {
746 fragmeta->blend.equation = *blend[0].equation.equation;
747 fragmeta->blend.constant = blend[0].equation.constant;
748 }
749
750 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
751 !blend[0].no_blending || fs->can_discard);
752 return;
753 }
754
755 if (dev->quirks & IS_BIFROST) {
756 bool no_blend = true;
757
758 for (unsigned i = 0; i < rt_count; ++i)
759 no_blend &= (blend[i].no_blending | blend[i].no_colour);
760
761 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
762 !fs->can_discard && !fs->writes_depth && no_blend);
763 }
764
765 /* Additional blend descriptor tacked on for jobs using MFBD */
766
767 for (unsigned i = 0; i < rt_count; ++i) {
768 unsigned flags = 0;
769
770 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
771 flags = 0x200;
772
773 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
774 (ctx->pipe_framebuffer.cbufs[i]) &&
775 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
776
777 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
778 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
779 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
780 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
781 }
782
783 if (dev->quirks & IS_BIFROST) {
784 struct bifrost_blend_rt *brts = rts;
785
786 brts[i].flags = flags;
787
788 if (blend[i].is_shader) {
789 /* The blend shader's address needs to be at
790 * the same top 32 bit as the fragment shader.
791 * TODO: Ensure that's always the case.
792 */
793 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
794 (fs->bo->gpu & (0xffffffffull << 32)));
795 brts[i].shader = blend[i].shader.gpu;
796 brts[i].unk2 = 0x0;
797 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
798 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
799 const struct util_format_description *format_desc;
800 format_desc = util_format_description(format);
801
802 brts[i].equation = *blend[i].equation.equation;
803
804 /* TODO: this is a bit more complicated */
805 brts[i].constant = blend[i].equation.constant;
806
807 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
808
809 /* 0x19 disables blending and forces REPLACE
810 * mode (equivalent to rgb_mode = alpha_mode =
811 * x122, colour mask = 0xF). 0x1a allows
812 * blending. */
813 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
814
815 brts[i].shader_type = fs->blend_types[i];
816 } else {
817 /* Dummy attachment for depth-only */
818 brts[i].unk2 = 0x3;
819 brts[i].shader_type = fs->blend_types[i];
820 }
821 } else {
822 struct midgard_blend_rt *mrts = rts;
823 mrts[i].flags = flags;
824
825 if (blend[i].is_shader) {
826 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
827 } else {
828 mrts[i].blend.equation = *blend[i].equation.equation;
829 mrts[i].blend.constant = blend[i].equation.constant;
830 }
831 }
832 }
833 }
834
835 static void
836 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
837 struct mali_shader_meta *fragmeta,
838 void *rts)
839 {
840 const struct panfrost_device *dev = pan_device(ctx->base.screen);
841 struct panfrost_shader_state *fs;
842
843 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
844
845 bool msaa = ctx->rasterizer && ctx->rasterizer->base.multisample;
846 fragmeta->coverage_mask = (msaa ? ctx->sample_mask : ~0) & 0xF;
847
848 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
849 fragmeta->unknown2_4 = 0x4e0;
850
851 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
852 * is required (independent of 32-bit/64-bit descriptors), or why it's
853 * not used on later GPU revisions. Otherwise, all shader jobs fault on
854 * these earlier chips (perhaps this is a chicken bit of some kind).
855 * More investigation is needed. */
856
857 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
858
859 if (dev->quirks & IS_BIFROST) {
860 /* TODO */
861 } else {
862                 /* When it is legal to do so in the given shader, we try to
863                  * enable early-z testing. TODO: respect e-z force */
864
865 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
866 !fs->can_discard && !fs->writes_global &&
867 !fs->writes_depth && !fs->writes_stencil &&
868 !ctx->blend->base.alpha_to_coverage);
869
870 /* Add the writes Z/S flags if needed. */
871 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
872 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
873
874 /* Any time texturing is used, derivatives are implicitly calculated,
875 * so we need to enable helper invocations */
876
877 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
878 fs->helper_invocations);
879
880 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
881
882 bool depth_enabled = fs->writes_depth ||
883 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
884
885 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
886 fs->outputs_read || (!depth_enabled && fs->can_discard));
887 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
888 }
889
890 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
891 panfrost_frag_meta_zsa_update(ctx, fragmeta);
892 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
893 }
894
895 void
896 panfrost_emit_shader_meta(struct panfrost_batch *batch,
897 enum pipe_shader_type st,
898 struct mali_vertex_tiler_postfix *postfix)
899 {
900 struct panfrost_context *ctx = batch->ctx;
901 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
902
903 if (!ss) {
904 postfix->shader = 0;
905 return;
906 }
907
908 struct mali_shader_meta meta;
909
910 panfrost_shader_meta_init(ctx, st, &meta);
911
912 /* Add the shader BO to the batch. */
913 panfrost_batch_add_bo(batch, ss->bo,
914 PAN_BO_ACCESS_PRIVATE |
915 PAN_BO_ACCESS_READ |
916 panfrost_bo_access_for_stage(st));
917
918 mali_ptr shader_ptr;
919
920 if (st == PIPE_SHADER_FRAGMENT) {
921 struct panfrost_device *dev = pan_device(ctx->base.screen);
922 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
923 size_t desc_size = sizeof(meta);
924 void *rts = NULL;
925 struct panfrost_transfer xfer;
926 unsigned rt_size;
927
928 if (dev->quirks & MIDGARD_SFBD)
929 rt_size = 0;
930 else if (dev->quirks & IS_BIFROST)
931 rt_size = sizeof(struct bifrost_blend_rt);
932 else
933 rt_size = sizeof(struct midgard_blend_rt);
934
935 desc_size += rt_size * rt_count;
936
937 if (rt_size)
938 rts = rzalloc_size(ctx, rt_size * rt_count);
939
940 panfrost_frag_shader_meta_init(ctx, &meta, rts);
941
942 xfer = panfrost_pool_alloc(&batch->pool, desc_size);
943
944 memcpy(xfer.cpu, &meta, sizeof(meta));
945 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
946
947 if (rt_size)
948 ralloc_free(rts);
949
950 shader_ptr = xfer.gpu;
951 } else {
952 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
953 sizeof(meta));
954 }
955
956 postfix->shader = shader_ptr;
957 }
958
959 static void
960 panfrost_mali_viewport_init(struct panfrost_context *ctx,
961 struct mali_viewport *mvp)
962 {
963 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
964
965 /* Clip bounds are encoded as floats. The viewport itself is encoded as
966 * (somewhat) asymmetric ints. */
967
968 const struct pipe_scissor_state *ss = &ctx->scissor;
969
970 memset(mvp, 0, sizeof(*mvp));
971
972 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
973 * each direction. Clipping to the viewport in theory should work, but
974 * in practice causes issues when we're not explicitly trying to
975 * scissor */
976
977 *mvp = (struct mali_viewport) {
978 .clip_minx = -INFINITY,
979 .clip_miny = -INFINITY,
980 .clip_maxx = INFINITY,
981 .clip_maxy = INFINITY,
982 };
983
984 /* Always scissor to the viewport by default. */
985 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
986 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
987
988 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
989 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
990
991 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
992 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
993
994 /* Apply the scissor test */
995
996 unsigned minx, miny, maxx, maxy;
997
998 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
999 minx = MAX2(ss->minx, vp_minx);
1000 miny = MAX2(ss->miny, vp_miny);
1001 maxx = MIN2(ss->maxx, vp_maxx);
1002 maxy = MIN2(ss->maxy, vp_maxy);
1003 } else {
1004 minx = vp_minx;
1005 miny = vp_miny;
1006 maxx = vp_maxx;
1007 maxy = vp_maxy;
1008 }
1009
1010 /* Hardware needs the min/max to be strictly ordered, so flip if we
1011 * need to. The viewport transformation in the vertex shader will
1012 * handle the negatives if we don't */
1013
1014 if (miny > maxy) {
1015 unsigned temp = miny;
1016 miny = maxy;
1017 maxy = temp;
1018 }
1019
1020 if (minx > maxx) {
1021 unsigned temp = minx;
1022 minx = maxx;
1023 maxx = temp;
1024 }
1025
1026 if (minz > maxz) {
1027 float temp = minz;
1028 minz = maxz;
1029 maxz = temp;
1030 }
1031
1032 /* Clamp to the framebuffer size as a last check */
1033
1034 minx = MIN2(ctx->pipe_framebuffer.width, minx);
1035 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
1036
1037 miny = MIN2(ctx->pipe_framebuffer.height, miny);
1038 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
1039
1040 /* Upload */
1041
1042 mvp->viewport0[0] = minx;
1043 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1044
1045 mvp->viewport0[1] = miny;
1046 mvp->viewport1[1] = MALI_POSITIVE(maxy);
1047
1048 bool clip_near = true;
1049 bool clip_far = true;
1050
1051 if (ctx->rasterizer) {
1052 clip_near = ctx->rasterizer->base.depth_clip_near;
1053 clip_far = ctx->rasterizer->base.depth_clip_far;
1054 }
1055
1056 mvp->clip_minz = clip_near ? minz : -INFINITY;
1057 mvp->clip_maxz = clip_far ? maxz : INFINITY;
1058 }
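
/* Rough example, assuming MALI_POSITIVE() is the usual minus-one encoding:
 * a 1920x1080 framebuffer with a default viewport and no scissor yields
 * viewport0 = { 0, 0 } and viewport1 = { 1919, 1079 } (inclusive maxima),
 * with clip_minz/clip_maxz spanning the depth range (e.g. [0, 1]) when
 * depth clipping is enabled. */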
1059
1060 void
1061 panfrost_emit_viewport(struct panfrost_batch *batch,
1062 struct mali_vertex_tiler_postfix *tiler_postfix)
1063 {
1064 struct panfrost_context *ctx = batch->ctx;
1065 struct mali_viewport mvp;
1066
1067 panfrost_mali_viewport_init(batch->ctx, &mvp);
1068
1069 /* Update the job, unless we're doing wallpapering (whose lack of
1070 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1071 * just... be faster :) */
1072
1073 if (!ctx->wallpaper_batch)
1074 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1075 mvp.viewport0[1],
1076 mvp.viewport1[0] + 1,
1077 mvp.viewport1[1] + 1);
1078
1079 tiler_postfix->viewport = panfrost_pool_upload(&batch->pool, &mvp,
1080 sizeof(mvp));
1081 }
1082
1083 static mali_ptr
1084 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1085 enum pipe_shader_type st,
1086 struct panfrost_constant_buffer *buf,
1087 unsigned index)
1088 {
1089 struct pipe_constant_buffer *cb = &buf->cb[index];
1090 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1091
1092 if (rsrc) {
1093 panfrost_batch_add_bo(batch, rsrc->bo,
1094 PAN_BO_ACCESS_SHARED |
1095 PAN_BO_ACCESS_READ |
1096 panfrost_bo_access_for_stage(st));
1097
1098                 /* Alignment guaranteed by
1099 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1100 return rsrc->bo->gpu + cb->buffer_offset;
1101 } else if (cb->user_buffer) {
1102 return panfrost_pool_upload(&batch->pool,
1103 cb->user_buffer +
1104 cb->buffer_offset,
1105 cb->buffer_size);
1106 } else {
1107 unreachable("No constant buffer");
1108 }
1109 }
1110
1111 struct sysval_uniform {
1112 union {
1113 float f[4];
1114 int32_t i[4];
1115 uint32_t u[4];
1116 uint64_t du[2];
1117 };
1118 };
1119
1120 static void
1121 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1122 struct sysval_uniform *uniform)
1123 {
1124 struct panfrost_context *ctx = batch->ctx;
1125 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1126
1127 uniform->f[0] = vp->scale[0];
1128 uniform->f[1] = vp->scale[1];
1129 uniform->f[2] = vp->scale[2];
1130 }
1131
1132 static void
1133 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1134 struct sysval_uniform *uniform)
1135 {
1136 struct panfrost_context *ctx = batch->ctx;
1137 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1138
1139 uniform->f[0] = vp->translate[0];
1140 uniform->f[1] = vp->translate[1];
1141 uniform->f[2] = vp->translate[2];
1142 }
1143
1144 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1145 enum pipe_shader_type st,
1146 unsigned int sysvalid,
1147 struct sysval_uniform *uniform)
1148 {
1149 struct panfrost_context *ctx = batch->ctx;
1150 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1151 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1152 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1153 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1154
1155 assert(dim);
1156 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1157
1158 if (dim > 1)
1159 uniform->i[1] = u_minify(tex->texture->height0,
1160 tex->u.tex.first_level);
1161
1162 if (dim > 2)
1163 uniform->i[2] = u_minify(tex->texture->depth0,
1164 tex->u.tex.first_level);
1165
1166 if (is_array)
1167 uniform->i[dim] = tex->texture->array_size;
1168 }
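
/* Example of the resulting texture-size sysval layout: a 256x128 2D array
 * texture with 4 layers sampled at first_level = 0 gives
 * uniform->i[] = { 256, 128, 4 } (dim = 2, array size in slot i[dim]);
 * sampling at first_level = 1 would instead give { 128, 64, 4 }, since only
 * the width/height are minified. */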
1169
1170 static void
1171 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1172 enum pipe_shader_type st,
1173 unsigned ssbo_id,
1174 struct sysval_uniform *uniform)
1175 {
1176 struct panfrost_context *ctx = batch->ctx;
1177
1178 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1179 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1180
1181 /* Compute address */
1182 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1183
1184 panfrost_batch_add_bo(batch, bo,
1185 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1186 panfrost_bo_access_for_stage(st));
1187
1188 /* Upload address and size as sysval */
1189 uniform->du[0] = bo->gpu + sb.buffer_offset;
1190 uniform->u[2] = sb.buffer_size;
1191 }
1192
1193 static void
1194 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1195 enum pipe_shader_type st,
1196 unsigned samp_idx,
1197 struct sysval_uniform *uniform)
1198 {
1199 struct panfrost_context *ctx = batch->ctx;
1200 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1201
1202 uniform->f[0] = sampl->min_lod;
1203 uniform->f[1] = sampl->max_lod;
1204 uniform->f[2] = sampl->lod_bias;
1205
1206 /* Even without any errata, Midgard represents "no mipmapping" as
1207 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1208 * panfrost_create_sampler_state which also explains our choice of
1209 * epsilon value (again to keep behaviour consistent) */
1210
1211 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1212 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1213 }
1214
1215 static void
1216 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1217 struct sysval_uniform *uniform)
1218 {
1219 struct panfrost_context *ctx = batch->ctx;
1220
1221 uniform->u[0] = ctx->compute_grid->grid[0];
1222 uniform->u[1] = ctx->compute_grid->grid[1];
1223 uniform->u[2] = ctx->compute_grid->grid[2];
1224 }
1225
1226 static void
1227 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1228 struct panfrost_shader_state *ss,
1229 enum pipe_shader_type st)
1230 {
1231 struct sysval_uniform *uniforms = (void *)buf;
1232
1233 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1234 int sysval = ss->sysval[i];
1235
1236 switch (PAN_SYSVAL_TYPE(sysval)) {
1237 case PAN_SYSVAL_VIEWPORT_SCALE:
1238 panfrost_upload_viewport_scale_sysval(batch,
1239 &uniforms[i]);
1240 break;
1241 case PAN_SYSVAL_VIEWPORT_OFFSET:
1242 panfrost_upload_viewport_offset_sysval(batch,
1243 &uniforms[i]);
1244 break;
1245 case PAN_SYSVAL_TEXTURE_SIZE:
1246 panfrost_upload_txs_sysval(batch, st,
1247 PAN_SYSVAL_ID(sysval),
1248 &uniforms[i]);
1249 break;
1250 case PAN_SYSVAL_SSBO:
1251 panfrost_upload_ssbo_sysval(batch, st,
1252 PAN_SYSVAL_ID(sysval),
1253 &uniforms[i]);
1254 break;
1255 case PAN_SYSVAL_NUM_WORK_GROUPS:
1256 panfrost_upload_num_work_groups_sysval(batch,
1257 &uniforms[i]);
1258 break;
1259 case PAN_SYSVAL_SAMPLER:
1260 panfrost_upload_sampler_sysval(batch, st,
1261 PAN_SYSVAL_ID(sysval),
1262 &uniforms[i]);
1263 break;
1264 default:
1265 assert(0);
1266 }
1267 }
1268 }
1269
1270 static const void *
1271 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1272 unsigned index)
1273 {
1274 struct pipe_constant_buffer *cb = &buf->cb[index];
1275 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1276
1277 if (rsrc)
1278 return rsrc->bo->cpu;
1279 else if (cb->user_buffer)
1280 return cb->user_buffer;
1281 else
1282 unreachable("No constant buffer");
1283 }
1284
1285 void
1286 panfrost_emit_const_buf(struct panfrost_batch *batch,
1287 enum pipe_shader_type stage,
1288 struct mali_vertex_tiler_postfix *postfix)
1289 {
1290 struct panfrost_context *ctx = batch->ctx;
1291 struct panfrost_shader_variants *all = ctx->shader[stage];
1292
1293 if (!all)
1294 return;
1295
1296 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1297
1298 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1299
1300 /* Uniforms are implicitly UBO #0 */
1301 bool has_uniforms = buf->enabled_mask & (1 << 0);
1302
1303 /* Allocate room for the sysval and the uniforms */
1304 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1305 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1306 size_t size = sys_size + uniform_size;
1307 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1308 size);
1309
1310 /* Upload sysvals requested by the shader */
1311 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1312
1313 /* Upload uniforms */
1314 if (has_uniforms && uniform_size) {
1315 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1316 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1317 }
1318
1319 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1320 * uploaded */
1321
1322 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1323 assert(ubo_count >= 1);
1324
1325 size_t sz = sizeof(uint64_t) * ubo_count;
1326 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1327 int uniform_count = ss->uniform_count;
1328
1329 /* Upload uniforms as a UBO */
1330 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1331
1332 /* The rest are honest-to-goodness UBOs */
1333
1334 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1335 size_t usz = buf->cb[ubo].buffer_size;
1336 bool enabled = buf->enabled_mask & (1 << ubo);
1337 bool empty = usz == 0;
1338
1339 if (!enabled || empty) {
1340 /* Stub out disabled UBOs to catch accesses */
1341 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1342 continue;
1343 }
1344
1345 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1346 buf, ubo);
1347
1348 unsigned bytes_per_field = 16;
1349 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1350 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1351 }
1352
1353 mali_ptr ubufs = panfrost_pool_upload(&batch->pool, ubos, sz);
1354 postfix->uniforms = transfer.gpu;
1355 postfix->uniform_buffers = ubufs;
1356
1357 buf->dirty_mask = 0;
1358 }
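
/* Sketch of the UBO descriptor encoding used above (sizes are expressed as a
 * count of 16-byte fields): a 208-byte UBO aligns to 208 bytes and becomes
 * MALI_MAKE_UBO(208 / 16 = 13, gpu), while a disabled or empty slot is
 * stubbed as MALI_MAKE_UBO(0, 0xDEAD0000) so stray accesses are easy to
 * spot. UBO #0 always points at the sysval + uniform upload made earlier in
 * this function. */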
1359
1360 void
1361 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1362 const struct pipe_grid_info *info,
1363 struct midgard_payload_vertex_tiler *vtp)
1364 {
1365 struct panfrost_context *ctx = batch->ctx;
1366 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1367 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1368 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1369 128));
1370 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1371 info->grid[2] * 4;
1372 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1373 shared_size,
1374 1);
1375
1376 struct mali_shared_memory shared = {
1377 .shared_memory = bo->gpu,
1378 .shared_workgroup_count =
1379 util_logbase2_ceil(info->grid[0]) +
1380 util_logbase2_ceil(info->grid[1]) +
1381 util_logbase2_ceil(info->grid[2]),
1382 .shared_unk1 = 0x2,
1383 .shared_shift = util_logbase2(single_size) - 1
1384 };
1385
1386 vtp->postfix.shared_memory = panfrost_pool_upload(&batch->pool, &shared,
1387 sizeof(shared));
1388 }
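
/* Worked sizing example (illustrative numbers): ss->shared_size = 200 gives
 * single_size = next_pow2(MAX2(200, 128)) = 256, so a 4x4x1 grid allocates
 * 256 * 4 * 4 * 1 * 4 = 16384 bytes. shared_workgroup_count becomes
 * ceil(log2(4)) + ceil(log2(4)) + ceil(log2(1)) = 4, and
 * shared_shift = log2(256) - 1 = 7. */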
1389
1390 static mali_ptr
1391 panfrost_get_tex_desc(struct panfrost_batch *batch,
1392 enum pipe_shader_type st,
1393 struct panfrost_sampler_view *view)
1394 {
1395 if (!view)
1396 return (mali_ptr) 0;
1397
1398 struct pipe_sampler_view *pview = &view->base;
1399 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1400
1401 /* Add the BO to the job so it's retained until the job is done. */
1402
1403 panfrost_batch_add_bo(batch, rsrc->bo,
1404 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1405 panfrost_bo_access_for_stage(st));
1406
1407 panfrost_batch_add_bo(batch, view->bo,
1408 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1409 panfrost_bo_access_for_stage(st));
1410
1411 return view->bo->gpu;
1412 }
1413
1414 static void
1415 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1416 struct pipe_context *pctx)
1417 {
1418 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1419 if (view->texture_bo != rsrc->bo->gpu ||
1420 view->layout != rsrc->layout) {
1421 panfrost_bo_unreference(view->bo);
1422 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1423 }
1424 }
1425
1426 void
1427 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1428 enum pipe_shader_type stage,
1429 struct mali_vertex_tiler_postfix *postfix)
1430 {
1431 struct panfrost_context *ctx = batch->ctx;
1432 struct panfrost_device *device = pan_device(ctx->base.screen);
1433
1434 if (!ctx->sampler_view_count[stage])
1435 return;
1436
1437 if (device->quirks & IS_BIFROST) {
1438 struct bifrost_texture_descriptor *descriptors;
1439
1440 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1441 ctx->sampler_view_count[stage]);
1442
1443 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1444 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1445 struct pipe_sampler_view *pview = &view->base;
1446 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1447 panfrost_update_sampler_view(view, &ctx->base);
1448
1449 /* Add the BOs to the job so they are retained until the job is done. */
1450
1451 panfrost_batch_add_bo(batch, rsrc->bo,
1452 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1453 panfrost_bo_access_for_stage(stage));
1454
1455 panfrost_batch_add_bo(batch, view->bo,
1456 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1457 panfrost_bo_access_for_stage(stage));
1458
1459 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1460 }
1461
1462 postfix->textures = panfrost_pool_upload(&batch->pool,
1463 descriptors,
1464 sizeof(struct bifrost_texture_descriptor) *
1465 ctx->sampler_view_count[stage]);
1466
1467 free(descriptors);
1468 } else {
1469 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1470
1471 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1472 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1473
1474 panfrost_update_sampler_view(view, &ctx->base);
1475
1476 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1477 }
1478
1479 postfix->textures = panfrost_pool_upload(&batch->pool,
1480 trampolines,
1481 sizeof(uint64_t) *
1482 ctx->sampler_view_count[stage]);
1483 }
1484 }
1485
1486 void
1487 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1488 enum pipe_shader_type stage,
1489 struct mali_vertex_tiler_postfix *postfix)
1490 {
1491 struct panfrost_context *ctx = batch->ctx;
1492 struct panfrost_device *device = pan_device(ctx->base.screen);
1493
1494 if (!ctx->sampler_count[stage])
1495 return;
1496
1497 if (device->quirks & IS_BIFROST) {
1498 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1499 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1500 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1501 transfer_size);
1502 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1503
1504 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1505 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1506
1507 postfix->sampler_descriptor = transfer.gpu;
1508 } else {
1509 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1510 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1511 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1512 transfer_size);
1513 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1514
1515 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1516 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1517
1518 postfix->sampler_descriptor = transfer.gpu;
1519 }
1520 }
1521
1522 void
1523 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1524 struct mali_vertex_tiler_postfix *vertex_postfix)
1525 {
1526 struct panfrost_context *ctx = batch->ctx;
1527
1528 if (!ctx->vertex)
1529 return;
1530
1531 struct panfrost_vertex_state *so = ctx->vertex;
1532
1533 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1534 vertex_postfix->attribute_meta = panfrost_pool_upload(&batch->pool, so->hw,
1535 sizeof(*so->hw) *
1536 PAN_MAX_ATTRIBUTE);
1537 }
1538
1539 void
1540 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1541 struct mali_vertex_tiler_postfix *vertex_postfix)
1542 {
1543 struct panfrost_context *ctx = batch->ctx;
1544 struct panfrost_vertex_state *so = ctx->vertex;
1545
1546 /* Staged mali_attr, and index into them. i =/= k, depending on the
1547 * vertex buffer mask and instancing. Twice as much room is allocated,
1548          * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1549 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1550 unsigned k = 0;
1551
1552 for (unsigned i = 0; i < so->num_elements; ++i) {
1553 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1554 * means duplicating some vertex buffers (who cares? aside from
1555 * maybe some caching implications but I somehow doubt that
1556 * matters) */
1557
1558 struct pipe_vertex_element *elem = &so->pipe[i];
1559 unsigned vbi = elem->vertex_buffer_index;
1560
1561 /* The exception to 1:1 mapping is that we can have multiple
1562                  * entries (NPOT divisors), so we fix up anyway */
1563
1564 so->hw[i].index = k;
1565
1566 if (!(ctx->vb_mask & (1 << vbi)))
1567 continue;
1568
1569 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1570 struct panfrost_resource *rsrc;
1571
1572 rsrc = pan_resource(buf->buffer.resource);
1573 if (!rsrc)
1574 continue;
1575
1576 /* Align to 64 bytes by masking off the lower bits. This
1577                  * will be adjusted back when we fix up the src_offset in
1578 * mali_attr_meta */
1579
1580 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1581 mali_ptr addr = raw_addr & ~63;
1582 unsigned chopped_addr = raw_addr - addr;
1583
1584 /* Add a dependency of the batch on the vertex buffer */
1585 panfrost_batch_add_bo(batch, rsrc->bo,
1586 PAN_BO_ACCESS_SHARED |
1587 PAN_BO_ACCESS_READ |
1588 PAN_BO_ACCESS_VERTEX_TILER);
1589
1590 /* Set common fields */
1591 attrs[k].elements = addr;
1592 attrs[k].stride = buf->stride;
1593
1594 /* Since we advanced the base pointer, we shrink the buffer
1595 * size */
1596 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1597
1598 /* We need to add the extra size we masked off (for
1599 * correctness) so the data doesn't get clamped away */
1600 attrs[k].size += chopped_addr;
1601
1602 /* For non-instancing make sure we initialize */
1603 attrs[k].shift = attrs[k].extra_flags = 0;
1604
1605 /* Instancing uses a dramatically different code path than
1606 * linear, so dispatch for the actual emission now that the
1607 * common code is finished */
1608
1609 unsigned divisor = elem->instance_divisor;
1610
1611 if (divisor && ctx->instance_count == 1) {
1612 /* Silly corner case where there's a divisor(=1) but
1613 * there's no legitimate instancing. So we want *every*
1614 * attribute to be the same. So set stride to zero so
1615 * we don't go anywhere. */
1616
1617 attrs[k].size = attrs[k].stride + chopped_addr;
1618 attrs[k].stride = 0;
1619 attrs[k++].elements |= MALI_ATTR_LINEAR;
1620 } else if (ctx->instance_count <= 1) {
1621 /* Normal, non-instanced attributes */
1622 attrs[k++].elements |= MALI_ATTR_LINEAR;
1623 } else {
1624 unsigned instance_shift = vertex_postfix->instance_shift;
1625 unsigned instance_odd = vertex_postfix->instance_odd;
1626
1627 k += panfrost_vertex_instanced(ctx->padded_count,
1628 instance_shift,
1629 instance_odd,
1630 divisor, &attrs[k]);
1631 }
1632 }
1633
1634 /* Add special gl_VertexID/gl_InstanceID buffers */
1635
1636 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1637 so->hw[PAN_VERTEX_ID].index = k++;
1638 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1639 so->hw[PAN_INSTANCE_ID].index = k++;
1640
1641 /* Upload whatever we emitted and go */
1642
1643 vertex_postfix->attributes = panfrost_pool_upload(&batch->pool, attrs,
1644 k * sizeof(*attrs));
1645 }
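
/* Example of the 64-byte alignment dance above (addresses invented): a
 * vertex buffer at raw_addr = 0x10000025 is emitted with addr = 0x10000000
 * and chopped_addr = 0x25 (37 bytes). Those 37 bytes are added back to
 * attrs[k].size here and re-added to src_offset when the attribute metadata
 * is fixed up (see panfrost_vertex_state_upd_attr_offs), so the GPU still
 * reads from the intended offset. */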
1646
1647 static mali_ptr
1648 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1649 unsigned stride, unsigned count)
1650 {
1651 /* Fill out the descriptor */
1652 slot->stride = stride;
1653 slot->size = stride * count;
1654 slot->shift = slot->extra_flags = 0;
1655
1656 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1657 slot->size);
1658
1659 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1660
1661 return transfer.gpu;
1662 }
1663
1664 static unsigned
1665 panfrost_streamout_offset(unsigned stride, unsigned offset,
1666 struct pipe_stream_output_target *target)
1667 {
1668 return (target->buffer_offset + (offset * stride * 4)) & 63;
1669 }
1670
1671 static void
1672 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1673 unsigned stride, unsigned offset, unsigned count,
1674 struct pipe_stream_output_target *target)
1675 {
1676 /* Fill out the descriptor */
1677 slot->stride = stride * 4;
1678 slot->shift = slot->extra_flags = 0;
1679
1680 unsigned max_size = target->buffer_size;
1681 unsigned expected_size = slot->stride * count;
1682
1683 /* Grab the BO and bind it to the batch */
1684 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1685
1686 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1687 * the perspective of the TILER and FRAGMENT.
1688 */
1689 panfrost_batch_add_bo(batch, bo,
1690 PAN_BO_ACCESS_SHARED |
1691 PAN_BO_ACCESS_RW |
1692 PAN_BO_ACCESS_VERTEX_TILER |
1693 PAN_BO_ACCESS_FRAGMENT);
1694
1695 /* We will have an offset applied to get alignment */
1696 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1697 slot->elements = (addr & ~63) | MALI_ATTR_LINEAR;
1698 slot->size = MIN2(max_size, expected_size) + (addr & 63);
1699 }
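
/* Worked example, assuming the BO base itself is 64-byte aligned: a stream
 * output target with buffer_offset = 0x70, offset = 0 and a 4-dword stride
 * gives slot->stride = 16 and addr = bo->gpu + 0x70. The record base drops
 * the low 0x30 (48) bytes, which are added back to slot->size;
 * panfrost_streamout_offset() returns the same 48 so the varying record can
 * skip them via src_offset. */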
1700
1701 static bool
1702 has_point_coord(unsigned mask, gl_varying_slot loc)
1703 {
1704 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1705 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1706 else if (loc == VARYING_SLOT_PNTC)
1707 return (mask & (1 << 8));
1708 else
1709 return false;
1710 }
1711
1712 /* Helpers for manipulating stream out information so we can pack varyings
1713 * accordingly. Compute the src_offset for a given captured varying */
1714
1715 static struct pipe_stream_output *
1716 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1717 {
1718 for (unsigned i = 0; i < info->num_outputs; ++i) {
1719 if (info->output[i].register_index == loc)
1720 return &info->output[i];
1721 }
1722
1723 unreachable("Varying not captured");
1724 }
1725
1726 static unsigned
1727 pan_varying_size(enum mali_format fmt)
1728 {
1729 unsigned type = MALI_EXTRACT_TYPE(fmt);
1730 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1731 unsigned bits = MALI_EXTRACT_BITS(fmt);
1732 unsigned bpc = 0;
1733
1734 if (bits == MALI_CHANNEL_FLOAT) {
1735 /* No doubles */
1736 bool fp16 = (type == MALI_FORMAT_SINT);
1737 assert(fp16 || (type == MALI_FORMAT_UNORM));
1738
1739 bpc = fp16 ? 2 : 4;
1740 } else {
1741 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1742
1743 /* See the enums */
1744 bits = 1 << bits;
1745 assert(bits >= 8);
1746 bpc = bits / 8;
1747 }
1748
1749 return bpc * chan;
1750 }
1751
1752 /* Indices for named (non-XFB) varyings that are present. These are packed
1753 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1754 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1755 * of a given special field given a shift S by:
1756 *
1757 * idx = popcount(P & ((1 << S) - 1))
1758 *
1759 * That is, count the varyings that come before this one; that count is the
1760 * new index. Likewise, the total number of special buffers required is
1761 * simply popcount(P)
1762 */
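/* For example: if present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION)
 * | (1 << PAN_VARY_PNTCOORD) = 0b1011, the buffer index of PNTCOORD (S = 3)
 * is popcount(0b1011 & 0b0111) = 2, i.e. it sits right after the general and
 * position buffers. */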
1763
1764 enum pan_special_varying {
1765 PAN_VARY_GENERAL = 0,
1766 PAN_VARY_POSITION = 1,
1767 PAN_VARY_PSIZ = 2,
1768 PAN_VARY_PNTCOORD = 3,
1769 PAN_VARY_FACE = 4,
1770 PAN_VARY_FRAGCOORD = 5,
1771
1772 /* Keep last */
1773 PAN_VARY_MAX,
1774 };
1775
1776 /* Given a varying, figure out which index it corresponds to */
1777
1778 static inline unsigned
1779 pan_varying_index(unsigned present, enum pan_special_varying v)
1780 {
1781 unsigned mask = (1 << v) - 1;
1782 return util_bitcount(present & mask);
1783 }
1784
1785 /* Get the base offset for XFB buffers, which by convention come after
1786 * everything else. Wrapper function for semantic reasons; by construction this
1787 * is just popcount. */
1788
1789 static inline unsigned
1790 pan_xfb_base(unsigned present)
1791 {
1792 return util_bitcount(present);
1793 }
1794
1795 /* Computes the present mask for varyings so we can start emitting varying records */
1796
1797 static inline unsigned
1798 pan_varying_present(
1799 struct panfrost_shader_state *vs,
1800 struct panfrost_shader_state *fs,
1801 unsigned quirks)
1802 {
1803 /* At the moment we always emit general and position buffers. Not
1804 * strictly necessary but usually harmless */
1805
1806 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1807
1808 /* Enable special buffers by the shader info */
1809
1810 if (vs->writes_point_size)
1811 present |= (1 << PAN_VARY_PSIZ);
1812
1813 if (fs->reads_point_coord)
1814 present |= (1 << PAN_VARY_PNTCOORD);
1815
1816 if (fs->reads_face)
1817 present |= (1 << PAN_VARY_FACE);
1818
1819 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1820 present |= (1 << PAN_VARY_FRAGCOORD);
1821
1822 /* Also, if we have a point sprite, we need a point coord buffer */
1823
1824 for (unsigned i = 0; i < fs->varying_count; i++) {
1825 gl_varying_slot loc = fs->varyings_loc[i];
1826
1827 if (has_point_coord(fs->point_sprite_mask, loc))
1828 present |= (1 << PAN_VARY_PNTCOORD);
1829 }
1830
1831 return present;
1832 }
1833
1834 /* Emitters for varying records */
1835
1836 static struct mali_attr_meta
1837 pan_emit_vary(unsigned present, enum pan_special_varying buf,
1838 unsigned quirks, enum mali_format format,
1839 unsigned offset)
1840 {
1841 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1842
1843 struct mali_attr_meta meta = {
1844 .index = pan_varying_index(present, buf),
1845 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1846 .swizzle = quirks & HAS_SWIZZLES ?
1847 panfrost_get_default_swizzle(nr_channels) :
1848 panfrost_bifrost_swizzle(nr_channels),
1849 .format = format,
1850 .src_offset = offset
1851 };
1852
1853 return meta;
1854 }
1855
1856 /* General varying that is unused */
1857
1858 static struct mali_attr_meta
1859 pan_emit_vary_only(unsigned present, unsigned quirks)
1860 {
1861 return pan_emit_vary(present, 0, quirks, MALI_VARYING_DISCARD, 0);
1862 }
1863
1864 /* Special records */
1865
1866 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1867 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1868 [PAN_VARY_PSIZ] = MALI_R16F,
1869 [PAN_VARY_PNTCOORD] = MALI_R16F,
1870 [PAN_VARY_FACE] = MALI_R32I,
1871 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1872 };
1873
1874 static struct mali_attr_meta
1875 pan_emit_vary_special(unsigned present, enum pan_special_varying buf,
1876 unsigned quirks)
1877 {
1878 assert(buf < PAN_VARY_MAX);
1879 return pan_emit_vary(present, buf, quirks, pan_varying_formats[buf], 0);
1880 }
1881
1882 static enum mali_format
1883 pan_xfb_format(enum mali_format format, unsigned nr)
1884 {
1885 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1886 return MALI_R32F | MALI_NR_CHANNELS(nr);
1887 else
1888 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1889 }
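/* For instance, a varying captured with three components whose channels are
 * floats would be rewritten by pan_xfb_format into a 3-channel 32-bit float
 * format, so the data landing in the transform feedback buffer is always
 * full precision. */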
1890
1891 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1892 * a bitfield) 32-bit, smaller than a 64-bit pointer, so we may as well pass
1893 * it by value. */
1894
1895 static struct mali_attr_meta
1896 pan_emit_vary_xfb(unsigned present,
1897 unsigned max_xfb,
1898 unsigned *streamout_offsets,
1899 unsigned quirks,
1900 enum mali_format format,
1901 struct pipe_stream_output o)
1902 {
1903 /* Construct a transform feedback record for the captured varying */
1904 struct mali_attr_meta meta = {
1905 /* XFB buffers come after everything else */
1906 .index = pan_xfb_base(present) + o.output_buffer,
1907
1908 /* As usual unknown bit */
1909 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1910
1911 /* Override swizzle with number of channels */
1912 .swizzle = quirks & HAS_SWIZZLES ?
1913 panfrost_get_default_swizzle(o.num_components) :
1914 panfrost_bifrost_swizzle(o.num_components),
1915
1916 /* Override number of channels and precision to highp */
1917 .format = pan_xfb_format(format, o.num_components),
1918
1919 /* Apply given offsets together */
1920 .src_offset = (o.dst_offset * 4) /* dwords */
1921 + streamout_offsets[o.output_buffer]
1922 };
1923
1924 return meta;
1925 }
1926
1927 /* Determine if we should capture a varying for XFB. This requires actually
1928 * having a buffer for it. If we don't capture it, we'll fall back to a general
1929 * varying path (linked or unlinked, possibly discarding the write) */
1930
1931 static bool
1932 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1933 unsigned loc, unsigned max_xfb)
1934 {
1935 if (!(xfb->so_mask & (1ll << loc)))
1936 return false;
1937
1938 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1939 return o->output_buffer < max_xfb;
1940 }
1941
1942 /* Higher-level wrapper around all of the above, classifying a varying into one
1943 * of the above types */
1944
1945 static struct mali_attr_meta
1946 panfrost_emit_varying(
1947 struct panfrost_shader_state *stage,
1948 struct panfrost_shader_state *other,
1949 struct panfrost_shader_state *xfb,
1950 unsigned present,
1951 unsigned max_xfb,
1952 unsigned *streamout_offsets,
1953 unsigned quirks,
1954 unsigned *gen_offsets,
1955 enum mali_format *gen_formats,
1956 unsigned *gen_stride,
1957 unsigned idx,
1958 bool should_alloc,
1959 bool is_fragment)
1960 {
1961 gl_varying_slot loc = stage->varyings_loc[idx];
1962 enum mali_format format = stage->varyings[idx];
1963
1964 /* Override format to match linkage */
1965 if (!should_alloc && gen_formats[idx])
1966 format = gen_formats[idx];
1967
1968 if (has_point_coord(stage->point_sprite_mask, loc)) {
1969 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1970 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1971 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1972 return pan_emit_vary_xfb(present, max_xfb, streamout_offsets, quirks, format, *o);
1973 } else if (loc == VARYING_SLOT_POS) {
1974 if (is_fragment)
1975 return pan_emit_vary_special(present, PAN_VARY_FRAGCOORD, quirks);
1976 else
1977 return pan_emit_vary_special(present, PAN_VARY_POSITION, quirks);
1978 } else if (loc == VARYING_SLOT_PSIZ) {
1979 return pan_emit_vary_special(present, PAN_VARY_PSIZ, quirks);
1980 } else if (loc == VARYING_SLOT_PNTC) {
1981 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1982 } else if (loc == VARYING_SLOT_FACE) {
1983 return pan_emit_vary_special(present, PAN_VARY_FACE, quirks);
1984 }
1985
1986 /* We've exhausted the special cases, so this is a general varying. Check whether it is linked with the other stage */
1987 signed other_idx = -1;
1988
1989 for (unsigned j = 0; j < other->varying_count; ++j) {
1990 if (other->varyings_loc[j] == loc) {
1991 other_idx = j;
1992 break;
1993 }
1994 }
1995
1996 if (other_idx < 0)
1997 return pan_emit_vary_only(present, quirks);
1998
1999 unsigned offset = gen_offsets[other_idx];
2000
2001 if (should_alloc) {
2002 /* We're linked, so allocate space via a watermark allocation */
2003 enum mali_format alt = other->varyings[other_idx];
2004
2005 /* Do interpolation at minimum precision */
2006 unsigned size_main = pan_varying_size(format);
2007 unsigned size_alt = pan_varying_size(alt);
2008 unsigned size = MIN2(size_main, size_alt);
2009
2010 /* If a varying is marked for XFB but not actually captured, we
2011 * should match the format to the format that would otherwise
2012 * be used for XFB, since dEQP checks for invariance here. It's
2013 * unclear if this is required by the spec. */
2014
2015 if (xfb->so_mask & (1ull << loc)) {
2016 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
2017 format = pan_xfb_format(format, o->num_components);
2018 size = pan_varying_size(format);
2019 } else if (size == size_alt) {
2020 format = alt;
2021 }
2022
2023 gen_offsets[idx] = *gen_stride;
2024 gen_formats[other_idx] = format;
2025 offset = *gen_stride;
2026 *gen_stride += size;
2027 }
2028
2029 return pan_emit_vary(present, PAN_VARY_GENERAL,
2030 quirks, format, offset);
2031 }
2032
2033 static void
2034 pan_emit_special_input(union mali_attr *varyings,
2035 unsigned present,
2036 enum pan_special_varying v,
2037 mali_ptr addr)
2038 {
2039 if (present & (1 << v)) {
2040 /* Write the record exactly once for performance, with the remaining
2041 * fields zeroed to avoid flaky behaviour from uninitialized data */
2042
2043 union mali_attr s = {
2044 .elements = addr
2045 };
2046
2047 varyings[pan_varying_index(present, v)] = s;
2048 }
2049 }
2050
2051 void
2052 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2053 unsigned vertex_count,
2054 struct mali_vertex_tiler_postfix *vertex_postfix,
2055 struct mali_vertex_tiler_postfix *tiler_postfix,
2056 union midgard_primitive_size *primitive_size)
2057 {
2058 /* Load the shaders */
2059 struct panfrost_context *ctx = batch->ctx;
2060 struct panfrost_device *dev = pan_device(ctx->base.screen);
2061 struct panfrost_shader_state *vs, *fs;
2062 size_t vs_size, fs_size;
2063
2064 /* Allocate the varying descriptor */
2065
2066 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2067 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
2068 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
2069 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
2070
2071 struct panfrost_transfer trans = panfrost_pool_alloc(&batch->pool,
2072 vs_size +
2073 fs_size);
2074
2075 struct pipe_stream_output_info *so = &vs->stream_output;
2076 unsigned present = pan_varying_present(vs, fs, dev->quirks);
2077
2078 /* Check if this varying is linked by us. This is the case for
2079 * general-purpose, non-captured varyings. If it is, link it. If it's
2080 * not, use the provided stream out information to determine the
2081 * offset, since it was already linked for us. */
2082
2083 unsigned gen_offsets[32];
2084 enum mali_format gen_formats[32];
2085 memset(gen_offsets, 0, sizeof(gen_offsets));
2086 memset(gen_formats, 0, sizeof(gen_formats));
2087
2088 unsigned gen_stride = 0;
2089 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
2090 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
2091
2092 unsigned streamout_offsets[32];
2093
2094 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2095 streamout_offsets[i] = panfrost_streamout_offset(
2096 so->stride[i],
2097 ctx->streamout.offsets[i],
2098 ctx->streamout.targets[i]);
2099 }
2100
2101 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
2102 struct mali_attr_meta *ofs = ovs + vs->varying_count;
2103
2104 for (unsigned i = 0; i < vs->varying_count; i++) {
2105 ovs[i] = panfrost_emit_varying(vs, fs, vs, present,
2106 ctx->streamout.num_targets, streamout_offsets,
2107 dev->quirks,
2108 gen_offsets, gen_formats, &gen_stride, i, true, false);
2109 }
2110
2111 for (unsigned i = 0; i < fs->varying_count; i++) {
2112 ofs[i] = panfrost_emit_varying(fs, vs, vs, present,
2113 ctx->streamout.num_targets, streamout_offsets,
2114 dev->quirks,
2115 gen_offsets, gen_formats, &gen_stride, i, false, true);
2116 }
2117
2118 unsigned xfb_base = pan_xfb_base(present);
2119 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
2120 sizeof(union mali_attr) * (xfb_base + ctx->streamout.num_targets));
2121 union mali_attr *varyings = (union mali_attr *) T.cpu;
2122
2123 /* Emit the stream out buffers */
2124
2125 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2126 ctx->vertex_count);
2127
2128 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2129 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2130 so->stride[i],
2131 ctx->streamout.offsets[i],
2132 out_count,
2133 ctx->streamout.targets[i]);
2134 }
2135
2136 panfrost_emit_varyings(batch,
2137 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2138 gen_stride, vertex_count);
2139
2140 /* fp32 vec4 gl_Position */
2141 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2142 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2143 sizeof(float) * 4, vertex_count);
2144
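        /* gl_PointSize is a single fp16 value (MALI_R16F in
         * pan_varying_formats above), hence the 2-byte stride */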
2145 if (present & (1 << PAN_VARY_PSIZ)) {
2146 primitive_size->pointer = panfrost_emit_varyings(batch,
2147 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2148 2, vertex_count);
2149 }
2150
2151 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_VARYING_POINT_COORD);
2152 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_VARYING_FRONT_FACING);
2153 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_VARYING_FRAG_COORD);
2154
2155 vertex_postfix->varyings = T.gpu;
2156 tiler_postfix->varyings = T.gpu;
2157
2158 vertex_postfix->varying_meta = trans.gpu;
2159 tiler_postfix->varying_meta = trans.gpu + vs_size;
2160 }
2161
2162 void
2163 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2164 struct mali_vertex_tiler_prefix *vertex_prefix,
2165 struct mali_vertex_tiler_postfix *vertex_postfix,
2166 struct mali_vertex_tiler_prefix *tiler_prefix,
2167 struct mali_vertex_tiler_postfix *tiler_postfix,
2168 union midgard_primitive_size *primitive_size)
2169 {
2170 struct panfrost_context *ctx = batch->ctx;
2171 struct panfrost_device *device = pan_device(ctx->base.screen);
2172 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2173 struct bifrost_payload_vertex bifrost_vertex = {0,};
2174 struct bifrost_payload_tiler bifrost_tiler = {0,};
2175 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2176 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2177 void *vp, *tp;
2178 size_t vp_size, tp_size;
2179
2180 if (device->quirks & IS_BIFROST) {
2181 bifrost_vertex.prefix = *vertex_prefix;
2182 bifrost_vertex.postfix = *vertex_postfix;
2183 vp = &bifrost_vertex;
2184 vp_size = sizeof(bifrost_vertex);
2185
2186 bifrost_tiler.prefix = *tiler_prefix;
2187 bifrost_tiler.tiler.primitive_size = *primitive_size;
2188 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2189 bifrost_tiler.postfix = *tiler_postfix;
2190 tp = &bifrost_tiler;
2191 tp_size = sizeof(bifrost_tiler);
2192 } else {
2193 midgard_vertex.prefix = *vertex_prefix;
2194 midgard_vertex.postfix = *vertex_postfix;
2195 vp = &midgard_vertex;
2196 vp_size = sizeof(midgard_vertex);
2197
2198 midgard_tiler.prefix = *tiler_prefix;
2199 midgard_tiler.postfix = *tiler_postfix;
2200 midgard_tiler.primitive_size = *primitive_size;
2201 tp = &midgard_tiler;
2202 tp_size = sizeof(midgard_tiler);
2203 }
2204
2205 if (wallpapering) {
2206 /* Inject in reverse order, with "predicted" job indices.
2207 * THIS IS A HACK XXX */
2208 panfrost_new_job(&batch->pool, &batch->scoreboard, JOB_TYPE_TILER, false,
2209 batch->scoreboard.job_index + 2, tp, tp_size, true);
2210 panfrost_new_job(&batch->pool, &batch->scoreboard, JOB_TYPE_VERTEX, false, 0,
2211 vp, vp_size, true);
2212 return;
2213 }
2214
2215 /* If rasterizer discard is enabled, only submit the vertex job */
2216
2217 bool rasterizer_discard = ctx->rasterizer &&
2218 ctx->rasterizer->base.rasterizer_discard;
2219
2220 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, JOB_TYPE_VERTEX, false, 0,
2221 vp, vp_size, false);
2222
2223 if (rasterizer_discard)
2224 return;
2225
2226 panfrost_new_job(&batch->pool, &batch->scoreboard, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2227 false);
2228 }
2229
2230 /* TODO: stop hardcoding this */
2231 mali_ptr
2232 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2233 {
2234 uint16_t locations[] = {
2235 128, 128,
2236 0, 256,
2237 0, 256,
2238 0, 256,
2239 0, 256,
2240 0, 256,
2241 0, 256,
2242 0, 256,
2243 0, 256,
2244 0, 256,
2245 0, 256,
2246 0, 256,
2247 0, 256,
2248 0, 256,
2249 0, 256,
2250 0, 256,
2251 0, 256,
2252 0, 256,
2253 0, 256,
2254 0, 256,
2255 0, 256,
2256 0, 256,
2257 0, 256,
2258 0, 256,
2259 0, 256,
2260 0, 256,
2261 0, 256,
2262 0, 256,
2263 0, 256,
2264 0, 256,
2265 0, 256,
2266 0, 256,
2267 128, 128,
2268 0, 0,
2269 0, 0,
2270 0, 0,
2271 0, 0,
2272 0, 0,
2273 0, 0,
2274 0, 0,
2275 0, 0,
2276 0, 0,
2277 0, 0,
2278 0, 0,
2279 0, 0,
2280 0, 0,
2281 0, 0,
2282 0, 0,
2283 };
2284
2285 return panfrost_pool_upload(&batch->pool, locations, 96 * sizeof(uint16_t));
2286 }