panfrost: Do fine-grained flushing for occlusion query results
[mesa.git] src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query) {
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
140 PAN_BO_ACCESS_SHARED |
141 PAN_BO_ACCESS_RW |
142 PAN_BO_ACCESS_FRAGMENT);
143 } else {
144 postfix->occlusion_counter = 0;
145 }
146 }
147
148 void
149 panfrost_vt_init(struct panfrost_context *ctx,
150 enum pipe_shader_type stage,
151 struct mali_vertex_tiler_prefix *prefix,
152 struct mali_vertex_tiler_postfix *postfix)
153 {
154 struct panfrost_device *device = pan_device(ctx->base.screen);
155
156 if (!ctx->shader[stage])
157 return;
158
159 memset(prefix, 0, sizeof(*prefix));
160 memset(postfix, 0, sizeof(*postfix));
161
162 if (device->quirks & IS_BIFROST) {
163 postfix->gl_enables = 0x2;
164 panfrost_vt_emit_shared_memory(ctx, postfix);
165 } else {
166 postfix->gl_enables = 0x6;
167 panfrost_vt_attach_framebuffer(ctx, postfix);
168 }
169
170 if (stage == PIPE_SHADER_FRAGMENT) {
171 panfrost_vt_update_occlusion_query(ctx, postfix);
172 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
173 }
174 }
175
176 static unsigned
177 panfrost_translate_index_size(unsigned size)
178 {
179 switch (size) {
180 case 1:
181 return MALI_DRAW_INDEXED_UINT8;
182
183 case 2:
184 return MALI_DRAW_INDEXED_UINT16;
185
186 case 4:
187 return MALI_DRAW_INDEXED_UINT32;
188
189 default:
190 unreachable("Invalid index size");
191 }
192 }
193
194 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
195 * good for the duration of the draw (transient), though it could last longer. Also gets
196 * the bounds on the index buffer for the range accessed by the draw. We do
197 * these operations together because there are natural optimizations which
198 * require them to be together. */
199
200 static mali_ptr
201 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
202 const struct pipe_draw_info *info,
203 unsigned *min_index, unsigned *max_index)
204 {
205 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
206 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
207 off_t offset = info->start * info->index_size;
208 bool needs_indices = true;
209 mali_ptr out = 0;
210
211 if (info->max_index != ~0u) {
212 *min_index = info->min_index;
213 *max_index = info->max_index;
214 needs_indices = false;
215 }
216
217 if (!info->has_user_indices) {
218 /* Only resources can be directly mapped */
219 panfrost_batch_add_bo(batch, rsrc->bo,
220 PAN_BO_ACCESS_SHARED |
221 PAN_BO_ACCESS_READ |
222 PAN_BO_ACCESS_VERTEX_TILER);
223 out = rsrc->bo->gpu + offset;
224
225 /* Check the cache */
226 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
227 info->start,
228 info->count,
229 min_index,
230 max_index);
231 } else {
232 /* Otherwise, we need to upload to transient memory */
233 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
234 out = panfrost_upload_transient(batch, ibuf8 + offset,
235 info->count *
236 info->index_size);
237 }
238
239 if (needs_indices) {
240 /* Fallback */
241 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
242
243 if (!info->has_user_indices)
244 panfrost_minmax_cache_add(rsrc->index_cache,
245 info->start, info->count,
246 *min_index, *max_index);
247 }
248
249 return out;
250 }
251
252 void
253 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
254 const struct pipe_draw_info *info,
255 enum mali_draw_mode draw_mode,
256 struct mali_vertex_tiler_postfix *vertex_postfix,
257 struct mali_vertex_tiler_prefix *tiler_prefix,
258 struct mali_vertex_tiler_postfix *tiler_postfix,
259 unsigned *vertex_count,
260 unsigned *padded_count)
261 {
262 tiler_prefix->draw_mode = draw_mode;
263
264 unsigned draw_flags = 0;
265
266 if (panfrost_writes_point_size(ctx))
267 draw_flags |= MALI_DRAW_VARYING_SIZE;
268
269 if (info->primitive_restart)
270 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
271
272 /* These don't make much sense */
273
274 draw_flags |= 0x3000;
275
276 if (info->index_size) {
277 unsigned min_index = 0, max_index = 0;
278
279 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
280 info,
281 &min_index,
282 &max_index);
283
284 /* Use the corresponding values */
285 *vertex_count = max_index - min_index + 1;
286 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
287 tiler_prefix->offset_bias_correction = -min_index;
288 tiler_prefix->index_count = MALI_POSITIVE(info->count);
289 draw_flags |= panfrost_translate_index_size(info->index_size);
290 } else {
291 tiler_prefix->indices = 0;
292 *vertex_count = ctx->vertex_count;
293 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
294 tiler_prefix->offset_bias_correction = 0;
295 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
296 }
297
298 tiler_prefix->unknown_draw = draw_flags;
299
300 /* Encode the padded vertex count */
301
302 if (info->instance_count > 1) {
303 *padded_count = panfrost_padded_vertex_count(*vertex_count);
304
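/* The instance fields below decompose the padded count as
 * (2 * odd + 1) << shift. For example, a padded count of 12 gives
 * shift = ctz(12) = 2 and odd = 12 >> 3 = 1, since 12 == (2 * 1 + 1) << 2. */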
305 unsigned shift = __builtin_ctz(ctx->padded_count);
306 unsigned k = ctx->padded_count >> (shift + 1);
307
308 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
309 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
310 } else {
311 *padded_count = *vertex_count;
312
313 /* Reset instancing state */
314 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
315 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
316 }
317 }
318
319 static void
320 panfrost_shader_meta_init(struct panfrost_context *ctx,
321 enum pipe_shader_type st,
322 struct mali_shader_meta *meta)
323 {
324 const struct panfrost_device *dev = pan_device(ctx->base.screen);
325 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
326
327 memset(meta, 0, sizeof(*meta));
328 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
329 meta->attribute_count = ss->attribute_count;
330 meta->varying_count = ss->varying_count;
331 meta->texture_count = ctx->sampler_view_count[st];
332 meta->sampler_count = ctx->sampler_count[st];
333
334 if (dev->quirks & IS_BIFROST) {
335 if (st == PIPE_SHADER_VERTEX)
336 meta->bifrost1.unk1 = 0x800000;
337 else {
338 /* First clause ATEST |= 0x4000000.
339 * Less than 32 regs |= 0x200 */
340 meta->bifrost1.unk1 = 0x950020;
341 }
342
343 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
344 if (st == PIPE_SHADER_VERTEX)
345 meta->bifrost2.preload_regs = 0xC0;
346 else {
347 meta->bifrost2.preload_regs = 0x1;
348 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
349 }
350
351 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
352 ss->uniform_cutoff);
353 } else {
354 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
355 ss->uniform_cutoff);
356 meta->midgard1.work_count = ss->work_reg_count;
357
358 /* TODO: This is not conformant on ES3 */
359 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
360
361 meta->midgard1.flags_lo = 0x20;
362 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
363
364 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
365 }
366 }
367
368 static unsigned
369 panfrost_translate_compare_func(enum pipe_compare_func in)
370 {
371 switch (in) {
372 case PIPE_FUNC_NEVER:
373 return MALI_FUNC_NEVER;
374
375 case PIPE_FUNC_LESS:
376 return MALI_FUNC_LESS;
377
378 case PIPE_FUNC_EQUAL:
379 return MALI_FUNC_EQUAL;
380
381 case PIPE_FUNC_LEQUAL:
382 return MALI_FUNC_LEQUAL;
383
384 case PIPE_FUNC_GREATER:
385 return MALI_FUNC_GREATER;
386
387 case PIPE_FUNC_NOTEQUAL:
388 return MALI_FUNC_NOTEQUAL;
389
390 case PIPE_FUNC_GEQUAL:
391 return MALI_FUNC_GEQUAL;
392
393 case PIPE_FUNC_ALWAYS:
394 return MALI_FUNC_ALWAYS;
395
396 default:
397 unreachable("Invalid func");
398 }
399 }
400
401 static unsigned
402 panfrost_translate_stencil_op(enum pipe_stencil_op in)
403 {
404 switch (in) {
405 case PIPE_STENCIL_OP_KEEP:
406 return MALI_STENCIL_KEEP;
407
408 case PIPE_STENCIL_OP_ZERO:
409 return MALI_STENCIL_ZERO;
410
411 case PIPE_STENCIL_OP_REPLACE:
412 return MALI_STENCIL_REPLACE;
413
414 case PIPE_STENCIL_OP_INCR:
415 return MALI_STENCIL_INCR;
416
417 case PIPE_STENCIL_OP_DECR:
418 return MALI_STENCIL_DECR;
419
420 case PIPE_STENCIL_OP_INCR_WRAP:
421 return MALI_STENCIL_INCR_WRAP;
422
423 case PIPE_STENCIL_OP_DECR_WRAP:
424 return MALI_STENCIL_DECR_WRAP;
425
426 case PIPE_STENCIL_OP_INVERT:
427 return MALI_STENCIL_INVERT;
428
429 default:
430 unreachable("Invalid stencil op");
431 }
432 }
433
434 static unsigned
435 translate_tex_wrap(enum pipe_tex_wrap w)
436 {
437 switch (w) {
438 case PIPE_TEX_WRAP_REPEAT:
439 return MALI_WRAP_REPEAT;
440
441 case PIPE_TEX_WRAP_CLAMP:
442 return MALI_WRAP_CLAMP;
443
444 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
445 return MALI_WRAP_CLAMP_TO_EDGE;
446
447 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
448 return MALI_WRAP_CLAMP_TO_BORDER;
449
450 case PIPE_TEX_WRAP_MIRROR_REPEAT:
451 return MALI_WRAP_MIRRORED_REPEAT;
452
453 case PIPE_TEX_WRAP_MIRROR_CLAMP:
454 return MALI_WRAP_MIRRORED_CLAMP;
455
456 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
457 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
458
459 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
460 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
461
462 default:
463 unreachable("Invalid wrap");
464 }
465 }
466
467 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
468 struct mali_sampler_descriptor *hw)
469 {
470 unsigned func = panfrost_translate_compare_func(cso->compare_func);
471 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
472 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
473 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
474 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
475 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
476 unsigned mip_filter = mip_linear ?
477 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
478 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
479
480 *hw = (struct mali_sampler_descriptor) {
481 .filter_mode = min_filter | mag_filter | mip_filter |
482 normalized,
483 .wrap_s = translate_tex_wrap(cso->wrap_s),
484 .wrap_t = translate_tex_wrap(cso->wrap_t),
485 .wrap_r = translate_tex_wrap(cso->wrap_r),
486 .compare_func = panfrost_flip_compare_func(func),
487 .border_color = {
488 cso->border_color.f[0],
489 cso->border_color.f[1],
490 cso->border_color.f[2],
491 cso->border_color.f[3]
492 },
493 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
494 .max_lod = FIXED_16(cso->max_lod, false),
495 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
496 .seamless_cube_map = cso->seamless_cube_map,
497 };
498
499 /* If necessary, we disable mipmapping in the sampler descriptor by
500 * clamping the LOD as tight as possible (from 0 to epsilon,
501 * essentially -- remember these are fixed point numbers, so
502 * epsilon=1/256) */
503
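/* For example, a min_lod of 2.0 encodes as 0x0200 and max_lod becomes
 * 0x0201 = 2 + 1/256, assuming FIXED_16 yields 8 fractional bits as the
 * epsilon above implies. */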
504 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
505 hw->max_lod = hw->min_lod + 1;
506 }
507
508 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
509 struct bifrost_sampler_descriptor *hw)
510 {
511 *hw = (struct bifrost_sampler_descriptor) {
512 .unk1 = 0x1,
513 .wrap_s = translate_tex_wrap(cso->wrap_s),
514 .wrap_t = translate_tex_wrap(cso->wrap_t),
515 .wrap_r = translate_tex_wrap(cso->wrap_r),
516 .unk8 = 0x8,
517 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
518 .norm_coords = cso->normalized_coords,
519 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
520 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
521 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
522 .max_lod = FIXED_16(cso->max_lod, false),
523 };
524
525 /* If necessary, we disable mipmapping in the sampler descriptor by
526 * clamping the LOD as tight as possible (from 0 to epsilon,
527 * essentially -- remember these are fixed point numbers, so
528 * epsilon=1/256) */
529
530 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
531 hw->max_lod = hw->min_lod + 1;
532 }
533
534 static void
535 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
536 struct mali_stencil_test *out)
537 {
538 out->ref = 0; /* Gallium gets it from elsewhere */
539
540 out->mask = in->valuemask;
541 out->func = panfrost_translate_compare_func(in->func);
542 out->sfail = panfrost_translate_stencil_op(in->fail_op);
543 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
544 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
545 }
546
547 static void
548 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
549 struct mali_shader_meta *fragmeta)
550 {
551 if (!ctx->rasterizer) {
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
553 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
554 fragmeta->depth_units = 0.0f;
555 fragmeta->depth_factor = 0.0f;
556 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
557 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
558 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, true);
559 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, true);
560 return;
561 }
562
563 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
564
565 bool msaa = rast->multisample;
566
567 /* TODO: Sample size */
568 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
569 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
570 fragmeta->depth_units = rast->offset_units * 2.0f;
571 fragmeta->depth_factor = rast->offset_scale;
572
573 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
574
575 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
576 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
577
578 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
579 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
580 }
581
582 static void
583 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
584 struct mali_shader_meta *fragmeta)
585 {
586 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
587 int zfunc = PIPE_FUNC_ALWAYS;
588
589 if (!zsa) {
590 struct pipe_stencil_state default_stencil = {
591 .enabled = 0,
592 .func = PIPE_FUNC_ALWAYS,
593 .fail_op = MALI_STENCIL_KEEP,
594 .zfail_op = MALI_STENCIL_KEEP,
595 .zpass_op = MALI_STENCIL_KEEP,
596 .writemask = 0xFF,
597 .valuemask = 0xFF
598 };
599
600 panfrost_make_stencil_state(&default_stencil,
601 &fragmeta->stencil_front);
602 fragmeta->stencil_mask_front = default_stencil.writemask;
603 fragmeta->stencil_back = fragmeta->stencil_front;
604 fragmeta->stencil_mask_back = default_stencil.writemask;
605 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
606 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
607 } else {
608 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
609 zsa->stencil[0].enabled);
610 panfrost_make_stencil_state(&zsa->stencil[0],
611 &fragmeta->stencil_front);
612 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
613 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
614
615 /* If back-stencil is not enabled, use the front values */
616
617 if (zsa->stencil[1].enabled) {
618 panfrost_make_stencil_state(&zsa->stencil[1],
619 &fragmeta->stencil_back);
620 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
621 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
622 } else {
623 fragmeta->stencil_back = fragmeta->stencil_front;
624 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
625 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
626 }
627
628 if (zsa->depth.enabled)
629 zfunc = zsa->depth.func;
630
631 /* Depth state (TODO: Refactor) */
632
633 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
634 zsa->depth.writemask);
635 }
636
637 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
638 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
639 }
640
641 static bool
642 panfrost_fs_required(
643 struct panfrost_shader_state *fs,
644 struct panfrost_blend_final *blend,
645 unsigned rt_count)
646 {
647 /* If we generally have side effects */
648 if (fs->fs_sidefx)
649 return true;
650
651 /* If colour is written we need to execute */
652 for (unsigned i = 0; i < rt_count; ++i) {
653 if (!blend[i].no_colour)
654 return true;
655 }
656
657 /* If depth is written and not implied we need to execute.
658 * TODO: Predicate on Z/S writes being enabled */
659 return (fs->writes_depth || fs->writes_stencil);
660 }
661
662 static void
663 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
664 struct mali_shader_meta *fragmeta,
665 void *rts)
666 {
667 const struct panfrost_device *dev = pan_device(ctx->base.screen);
668 struct panfrost_shader_state *fs;
669 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
670
671 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
672 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
673 !ctx->blend->base.dither);
674
675 /* Get blending setup */
676 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
677
678 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
679 unsigned shader_offset = 0;
680 struct panfrost_bo *shader_bo = NULL;
681
682 for (unsigned c = 0; c < rt_count; ++c)
683 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
684 &shader_offset);
685
686 /* Disable shader execution if we can */
687 if (dev->quirks & MIDGARD_SHADERLESS
688 && !panfrost_fs_required(fs, blend, rt_count)) {
689 fragmeta->shader = 0;
690 fragmeta->attribute_count = 0;
691 fragmeta->varying_count = 0;
692 fragmeta->texture_count = 0;
693 fragmeta->sampler_count = 0;
694
695 /* This feature is not known to work on Bifrost */
696 fragmeta->midgard1.work_count = 1;
697 fragmeta->midgard1.uniform_count = 0;
698 fragmeta->midgard1.uniform_buffer_count = 0;
699 }
700
701 /* If there is a blend shader, work registers are shared. We impose 8
702 * work registers as a limit for blend shaders. Should be lower XXX */
703
704 if (!(dev->quirks & IS_BIFROST)) {
705 for (unsigned c = 0; c < rt_count; ++c) {
706 if (blend[c].is_shader) {
707 fragmeta->midgard1.work_count =
708 MAX2(fragmeta->midgard1.work_count, 8);
709 }
710 }
711 }
712
713 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
714 * copied to the blend_meta appended (by convention), but this is the
715 * field actually read by the hardware. (Or maybe both are read...?).
716 * Specify the last RTi with a blend shader. */
717
718 fragmeta->blend.shader = 0;
719
720 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
721 if (!blend[rt].is_shader)
722 continue;
723
724 fragmeta->blend.shader = blend[rt].shader.gpu |
725 blend[rt].shader.first_tag;
726 break;
727 }
728
729 if (dev->quirks & MIDGARD_SFBD) {
730 /* On single render target (SFBD) platforms, the blend
731 * information is inside the shader meta itself. We additionally
732 * need to signal CAN_DISCARD for nontrivial blend modes (so
733 * we're able to read back the destination buffer) */
734
735 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
736 blend[0].is_shader);
737
738 if (!blend[0].is_shader) {
739 fragmeta->blend.equation = *blend[0].equation.equation;
740 fragmeta->blend.constant = blend[0].equation.constant;
741 }
742
743 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
744 !blend[0].no_blending || fs->can_discard);
745 return;
746 }
747
748 if (dev->quirks & IS_BIFROST) {
749 bool no_blend = true;
750
751 for (unsigned i = 0; i < rt_count; ++i)
752 no_blend &= (blend[i].no_blending | blend[i].no_colour);
753
754 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
755 !fs->can_discard && !fs->writes_depth && no_blend);
756 }
757
758 /* Additional blend descriptor tacked on for jobs using MFBD */
759
760 for (unsigned i = 0; i < rt_count; ++i) {
761 unsigned flags = 0;
762
763 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
764 flags = 0x200;
765
766 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
767 (ctx->pipe_framebuffer.cbufs[i]) &&
768 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
769
770 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
771 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
772 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
773 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
774 }
775
776 if (dev->quirks & IS_BIFROST) {
777 struct bifrost_blend_rt *brts = rts;
778
779 brts[i].flags = flags;
780
781 if (blend[i].is_shader) {
782 /* The blend shader's address needs to be at
783 * the same top 32 bits as the fragment shader.
784 * TODO: Ensure that's always the case.
785 */
786 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
787 (fs->bo->gpu & (0xffffffffull << 32)));
788 brts[i].shader = blend[i].shader.gpu;
789 brts[i].unk2 = 0x0;
790 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
791 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
792 const struct util_format_description *format_desc;
793 format_desc = util_format_description(format);
794
795 brts[i].equation = *blend[i].equation.equation;
796
797 /* TODO: this is a bit more complicated */
798 brts[i].constant = blend[i].equation.constant;
799
800 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
801
802 /* 0x19 disables blending and forces REPLACE
803 * mode (equivalent to rgb_mode = alpha_mode =
804 * x122, colour mask = 0xF). 0x1a allows
805 * blending. */
806 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
807
808 brts[i].shader_type = fs->blend_types[i];
809 } else {
810 /* Dummy attachment for depth-only */
811 brts[i].unk2 = 0x3;
812 brts[i].shader_type = fs->blend_types[i];
813 }
814 } else {
815 struct midgard_blend_rt *mrts = rts;
816 mrts[i].flags = flags;
817
818 if (blend[i].is_shader) {
819 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
820 } else {
821 mrts[i].blend.equation = *blend[i].equation.equation;
822 mrts[i].blend.constant = blend[i].equation.constant;
823 }
824 }
825 }
826 }
827
828 static void
829 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
830 struct mali_shader_meta *fragmeta,
831 void *rts)
832 {
833 const struct panfrost_device *dev = pan_device(ctx->base.screen);
834 struct panfrost_shader_state *fs;
835
836 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
837
838 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
839 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
840 fragmeta->unknown2_4 = 0x4e0;
841
842 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
843 * is required (independent of 32-bit/64-bit descriptors), or why it's
844 * not used on later GPU revisions. Otherwise, all shader jobs fault on
845 * these earlier chips (perhaps this is a chicken bit of some kind).
846 * More investigation is needed. */
847
848 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
849
850 if (dev->quirks & IS_BIFROST) {
851 /* TODO */
852 } else {
853 /* Depending on whether it's legal to do so in the given shader, we try to
854 * enable early-z testing. TODO: respect e-z force */
855
856 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
857 !fs->can_discard && !fs->writes_global &&
858 !fs->writes_depth && !fs->writes_stencil);
859
860 /* Add the writes Z/S flags if needed. */
861 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
862 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
863
864 /* Any time texturing is used, derivatives are implicitly calculated,
865 * so we need to enable helper invocations */
866
867 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
868 fs->helper_invocations);
869
870 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
871
872 bool depth_enabled = fs->writes_depth ||
873 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
874
875 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
876 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
877 }
878
879 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
880 panfrost_frag_meta_zsa_update(ctx, fragmeta);
881 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
882 }
883
884 void
885 panfrost_emit_shader_meta(struct panfrost_batch *batch,
886 enum pipe_shader_type st,
887 struct mali_vertex_tiler_postfix *postfix)
888 {
889 struct panfrost_context *ctx = batch->ctx;
890 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
891
892 if (!ss) {
893 postfix->shader = 0;
894 return;
895 }
896
897 struct mali_shader_meta meta;
898
899 panfrost_shader_meta_init(ctx, st, &meta);
900
901 /* Add the shader BO to the batch. */
902 panfrost_batch_add_bo(batch, ss->bo,
903 PAN_BO_ACCESS_PRIVATE |
904 PAN_BO_ACCESS_READ |
905 panfrost_bo_access_for_stage(st));
906
907 mali_ptr shader_ptr;
908
909 if (st == PIPE_SHADER_FRAGMENT) {
910 struct panfrost_device *dev = pan_device(ctx->base.screen);
911 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
912 size_t desc_size = sizeof(meta);
913 void *rts = NULL;
914 struct panfrost_transfer xfer;
915 unsigned rt_size;
916
917 if (dev->quirks & MIDGARD_SFBD)
918 rt_size = 0;
919 else if (dev->quirks & IS_BIFROST)
920 rt_size = sizeof(struct bifrost_blend_rt);
921 else
922 rt_size = sizeof(struct midgard_blend_rt);
923
924 desc_size += rt_size * rt_count;
925
926 if (rt_size)
927 rts = rzalloc_size(ctx, rt_size * rt_count);
928
929 panfrost_frag_shader_meta_init(ctx, &meta, rts);
930
931 xfer = panfrost_allocate_transient(batch, desc_size);
932
933 memcpy(xfer.cpu, &meta, sizeof(meta));
934 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
935
936 if (rt_size)
937 ralloc_free(rts);
938
939 shader_ptr = xfer.gpu;
940 } else {
941 shader_ptr = panfrost_upload_transient(batch, &meta,
942 sizeof(meta));
943 }
944
945 postfix->shader = shader_ptr;
946 }
947
948 static void
949 panfrost_mali_viewport_init(struct panfrost_context *ctx,
950 struct mali_viewport *mvp)
951 {
952 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
953
954 /* Clip bounds are encoded as floats. The viewport itself is encoded as
955 * (somewhat) asymmetric ints. */
956
957 const struct pipe_scissor_state *ss = &ctx->scissor;
958
959 memset(mvp, 0, sizeof(*mvp));
960
961 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
962 * each direction. Clipping to the viewport in theory should work, but
963 * in practice causes issues when we're not explicitly trying to
964 * scissor */
965
966 *mvp = (struct mali_viewport) {
967 .clip_minx = -INFINITY,
968 .clip_miny = -INFINITY,
969 .clip_maxx = INFINITY,
970 .clip_maxy = INFINITY,
971 };
972
973 /* Always scissor to the viewport by default. */
974 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
975 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
976
977 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
978 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
979
980 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
981 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
982
983 /* Apply the scissor test */
984
985 unsigned minx, miny, maxx, maxy;
986
987 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
988 minx = MAX2(ss->minx, vp_minx);
989 miny = MAX2(ss->miny, vp_miny);
990 maxx = MIN2(ss->maxx, vp_maxx);
991 maxy = MIN2(ss->maxy, vp_maxy);
992 } else {
993 minx = vp_minx;
994 miny = vp_miny;
995 maxx = vp_maxx;
996 maxy = vp_maxy;
997 }
998
999 /* Hardware needs the min/max to be strictly ordered, so flip if we
1000 * need to. The viewport transformation in the vertex shader will
1001 * handle the negatives if we don't */
1002
1003 if (miny > maxy) {
1004 unsigned temp = miny;
1005 miny = maxy;
1006 maxy = temp;
1007 }
1008
1009 if (minx > maxx) {
1010 unsigned temp = minx;
1011 minx = maxx;
1012 maxx = temp;
1013 }
1014
1015 if (minz > maxz) {
1016 float temp = minz;
1017 minz = maxz;
1018 maxz = temp;
1019 }
1020
1021 /* Clamp to the framebuffer size as a last check */
1022
1023 minx = MIN2(ctx->pipe_framebuffer.width, minx);
1024 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
1025
1026 miny = MIN2(ctx->pipe_framebuffer.height, miny);
1027 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
1028
1029 /* Upload */
1030
1031 mvp->viewport0[0] = minx;
1032 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1033
1034 mvp->viewport0[1] = miny;
1035 mvp->viewport1[1] = MALI_POSITIVE(maxy);
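/* viewport1 holds the maxima biased by one (MALI_POSITIVE stores n - 1),
 * which is why panfrost_emit_viewport adds 1 back when unioning the
 * batch scissor below. */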
1036
1037 bool clip_near = true;
1038 bool clip_far = true;
1039
1040 if (ctx->rasterizer) {
1041 clip_near = ctx->rasterizer->base.depth_clip_near;
1042 clip_far = ctx->rasterizer->base.depth_clip_far;
1043 }
1044
1045 mvp->clip_minz = clip_near ? minz : -INFINITY;
1046 mvp->clip_maxz = clip_far ? maxz : INFINITY;
1047 }
1048
1049 void
1050 panfrost_emit_viewport(struct panfrost_batch *batch,
1051 struct mali_vertex_tiler_postfix *tiler_postfix)
1052 {
1053 struct panfrost_context *ctx = batch->ctx;
1054 struct mali_viewport mvp;
1055
1056 panfrost_mali_viewport_init(batch->ctx, &mvp);
1057
1058 /* Update the job, unless we're doing wallpapering (whose lack of
1059 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1060 * just... be faster :) */
1061
1062 if (!ctx->wallpaper_batch)
1063 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1064 mvp.viewport0[1],
1065 mvp.viewport1[0] + 1,
1066 mvp.viewport1[1] + 1);
1067
1068 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1069 sizeof(mvp));
1070 }
1071
1072 static mali_ptr
1073 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1074 enum pipe_shader_type st,
1075 struct panfrost_constant_buffer *buf,
1076 unsigned index)
1077 {
1078 struct pipe_constant_buffer *cb = &buf->cb[index];
1079 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1080
1081 if (rsrc) {
1082 panfrost_batch_add_bo(batch, rsrc->bo,
1083 PAN_BO_ACCESS_SHARED |
1084 PAN_BO_ACCESS_READ |
1085 panfrost_bo_access_for_stage(st));
1086
1087 /* Alignment guaranteed by
1088 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1089 return rsrc->bo->gpu + cb->buffer_offset;
1090 } else if (cb->user_buffer) {
1091 return panfrost_upload_transient(batch,
1092 cb->user_buffer +
1093 cb->buffer_offset,
1094 cb->buffer_size);
1095 } else {
1096 unreachable("No constant buffer");
1097 }
1098 }
1099
1100 struct sysval_uniform {
1101 union {
1102 float f[4];
1103 int32_t i[4];
1104 uint32_t u[4];
1105 uint64_t du[2];
1106 };
1107 };
1108
1109 static void
1110 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1111 struct sysval_uniform *uniform)
1112 {
1113 struct panfrost_context *ctx = batch->ctx;
1114 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1115
1116 uniform->f[0] = vp->scale[0];
1117 uniform->f[1] = vp->scale[1];
1118 uniform->f[2] = vp->scale[2];
1119 }
1120
1121 static void
1122 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1123 struct sysval_uniform *uniform)
1124 {
1125 struct panfrost_context *ctx = batch->ctx;
1126 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1127
1128 uniform->f[0] = vp->translate[0];
1129 uniform->f[1] = vp->translate[1];
1130 uniform->f[2] = vp->translate[2];
1131 }
1132
1133 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1134 enum pipe_shader_type st,
1135 unsigned int sysvalid,
1136 struct sysval_uniform *uniform)
1137 {
1138 struct panfrost_context *ctx = batch->ctx;
1139 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1140 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1141 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1142 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1143
1144 assert(dim);
1145 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1146
1147 if (dim > 1)
1148 uniform->i[1] = u_minify(tex->texture->height0,
1149 tex->u.tex.first_level);
1150
1151 if (dim > 2)
1152 uniform->i[2] = u_minify(tex->texture->depth0,
1153 tex->u.tex.first_level);
1154
1155 if (is_array)
1156 uniform->i[dim] = tex->texture->array_size;
1157 }
1158
1159 static void
1160 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1161 enum pipe_shader_type st,
1162 unsigned ssbo_id,
1163 struct sysval_uniform *uniform)
1164 {
1165 struct panfrost_context *ctx = batch->ctx;
1166
1167 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1168 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1169
1170 /* Compute address */
1171 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1172
1173 panfrost_batch_add_bo(batch, bo,
1174 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1175 panfrost_bo_access_for_stage(st));
1176
1177 /* Upload address and size as sysval */
1178 uniform->du[0] = bo->gpu + sb.buffer_offset;
1179 uniform->u[2] = sb.buffer_size;
1180 }
1181
1182 static void
1183 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1184 enum pipe_shader_type st,
1185 unsigned samp_idx,
1186 struct sysval_uniform *uniform)
1187 {
1188 struct panfrost_context *ctx = batch->ctx;
1189 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1190
1191 uniform->f[0] = sampl->min_lod;
1192 uniform->f[1] = sampl->max_lod;
1193 uniform->f[2] = sampl->lod_bias;
1194
1195 /* Even without any errata, Midgard represents "no mipmapping" as
1196 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1197 * panfrost_create_sampler_state which also explains our choice of
1198 * epsilon value (again to keep behaviour consistent) */
1199
1200 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1201 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1202 }
1203
1204 static void
1205 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1206 struct sysval_uniform *uniform)
1207 {
1208 struct panfrost_context *ctx = batch->ctx;
1209
1210 uniform->u[0] = ctx->compute_grid->grid[0];
1211 uniform->u[1] = ctx->compute_grid->grid[1];
1212 uniform->u[2] = ctx->compute_grid->grid[2];
1213 }
1214
1215 static void
1216 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1217 struct panfrost_shader_state *ss,
1218 enum pipe_shader_type st)
1219 {
1220 struct sysval_uniform *uniforms = (void *)buf;
1221
1222 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1223 int sysval = ss->sysval[i];
1224
1225 switch (PAN_SYSVAL_TYPE(sysval)) {
1226 case PAN_SYSVAL_VIEWPORT_SCALE:
1227 panfrost_upload_viewport_scale_sysval(batch,
1228 &uniforms[i]);
1229 break;
1230 case PAN_SYSVAL_VIEWPORT_OFFSET:
1231 panfrost_upload_viewport_offset_sysval(batch,
1232 &uniforms[i]);
1233 break;
1234 case PAN_SYSVAL_TEXTURE_SIZE:
1235 panfrost_upload_txs_sysval(batch, st,
1236 PAN_SYSVAL_ID(sysval),
1237 &uniforms[i]);
1238 break;
1239 case PAN_SYSVAL_SSBO:
1240 panfrost_upload_ssbo_sysval(batch, st,
1241 PAN_SYSVAL_ID(sysval),
1242 &uniforms[i]);
1243 break;
1244 case PAN_SYSVAL_NUM_WORK_GROUPS:
1245 panfrost_upload_num_work_groups_sysval(batch,
1246 &uniforms[i]);
1247 break;
1248 case PAN_SYSVAL_SAMPLER:
1249 panfrost_upload_sampler_sysval(batch, st,
1250 PAN_SYSVAL_ID(sysval),
1251 &uniforms[i]);
1252 break;
1253 default:
1254 assert(0);
1255 }
1256 }
1257 }
1258
1259 static const void *
1260 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1261 unsigned index)
1262 {
1263 struct pipe_constant_buffer *cb = &buf->cb[index];
1264 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1265
1266 if (rsrc)
1267 return rsrc->bo->cpu;
1268 else if (cb->user_buffer)
1269 return cb->user_buffer;
1270 else
1271 unreachable("No constant buffer");
1272 }
1273
1274 void
1275 panfrost_emit_const_buf(struct panfrost_batch *batch,
1276 enum pipe_shader_type stage,
1277 struct mali_vertex_tiler_postfix *postfix)
1278 {
1279 struct panfrost_context *ctx = batch->ctx;
1280 struct panfrost_shader_variants *all = ctx->shader[stage];
1281
1282 if (!all)
1283 return;
1284
1285 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1286
1287 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1288
1289 /* Uniforms are implicitly UBO #0 */
1290 bool has_uniforms = buf->enabled_mask & (1 << 0);
1291
1292 /* Allocate room for the sysval and the uniforms */
1293 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1294 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1295 size_t size = sys_size + uniform_size;
1296 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1297 size);
1298
1299 /* Upload sysvals requested by the shader */
1300 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1301
1302 /* Upload uniforms */
1303 if (has_uniforms && uniform_size) {
1304 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1305 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1306 }
1307
1308 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1309 * uploaded */
1310
1311 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1312 assert(ubo_count >= 1);
1313
1314 size_t sz = sizeof(uint64_t) * ubo_count;
1315 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1316 int uniform_count = ss->uniform_count;
1317
1318 /* Upload uniforms as a UBO */
1319 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1320
1321 /* The rest are honest-to-goodness UBOs */
1322
1323 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1324 size_t usz = buf->cb[ubo].buffer_size;
1325 bool enabled = buf->enabled_mask & (1 << ubo);
1326 bool empty = usz == 0;
1327
1328 if (!enabled || empty) {
1329 /* Stub out disabled UBOs to catch accesses */
1330 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1331 continue;
1332 }
1333
1334 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1335 buf, ubo);
1336
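/* The size passed to MALI_MAKE_UBO below is a count of 16-byte fields
 * (presumably vec4-sized slots), so round the byte size up: e.g. a
 * 100-byte UBO becomes ALIGN_POT(100, 16) / 16 = 7 fields. */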
1337 unsigned bytes_per_field = 16;
1338 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1339 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1340 }
1341
1342 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1343 postfix->uniforms = transfer.gpu;
1344 postfix->uniform_buffers = ubufs;
1345
1346 buf->dirty_mask = 0;
1347 }
1348
1349 void
1350 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1351 const struct pipe_grid_info *info,
1352 struct midgard_payload_vertex_tiler *vtp)
1353 {
1354 struct panfrost_context *ctx = batch->ctx;
1355 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1356 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1357 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1358 128));
1359 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1360 info->grid[2] * 4;
1361 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1362 shared_size,
1363 1);
1364
1365 struct mali_shared_memory shared = {
1366 .shared_memory = bo->gpu,
1367 .shared_workgroup_count =
1368 util_logbase2_ceil(info->grid[0]) +
1369 util_logbase2_ceil(info->grid[1]) +
1370 util_logbase2_ceil(info->grid[2]),
1371 .shared_unk1 = 0x2,
1372 .shared_shift = util_logbase2(single_size) - 1
1373 };
1374
1375 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1376 sizeof(shared));
1377 }
1378
1379 static mali_ptr
1380 panfrost_get_tex_desc(struct panfrost_batch *batch,
1381 enum pipe_shader_type st,
1382 struct panfrost_sampler_view *view)
1383 {
1384 if (!view)
1385 return (mali_ptr) 0;
1386
1387 struct pipe_sampler_view *pview = &view->base;
1388 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1389
1390 /* Add the BO to the job so it's retained until the job is done. */
1391
1392 panfrost_batch_add_bo(batch, rsrc->bo,
1393 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1394 panfrost_bo_access_for_stage(st));
1395
1396 panfrost_batch_add_bo(batch, view->bo,
1397 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1398 panfrost_bo_access_for_stage(st));
1399
1400 return view->bo->gpu;
1401 }
1402
1403 static void
1404 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1405 struct pipe_context *pctx)
1406 {
1407 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1408 if (view->texture_bo != rsrc->bo->gpu ||
1409 view->layout != rsrc->layout) {
1410 panfrost_bo_unreference(view->bo);
1411 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1412 }
1413 }
1414
1415 void
1416 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1417 enum pipe_shader_type stage,
1418 struct mali_vertex_tiler_postfix *postfix)
1419 {
1420 struct panfrost_context *ctx = batch->ctx;
1421 struct panfrost_device *device = pan_device(ctx->base.screen);
1422
1423 if (!ctx->sampler_view_count[stage])
1424 return;
1425
1426 if (device->quirks & IS_BIFROST) {
1427 struct bifrost_texture_descriptor *descriptors;
1428
1429 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1430 ctx->sampler_view_count[stage]);
1431
1432 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1433 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1434 struct pipe_sampler_view *pview = &view->base;
1435 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1436 panfrost_update_sampler_view(view, &ctx->base);
1437
1438 /* Add the BOs to the job so they are retained until the job is done. */
1439
1440 panfrost_batch_add_bo(batch, rsrc->bo,
1441 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1442 panfrost_bo_access_for_stage(stage));
1443
1444 panfrost_batch_add_bo(batch, view->bo,
1445 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1446 panfrost_bo_access_for_stage(stage));
1447
1448 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1449 }
1450
1451 postfix->textures = panfrost_upload_transient(batch,
1452 descriptors,
1453 sizeof(struct bifrost_texture_descriptor) *
1454 ctx->sampler_view_count[stage]);
1455
1456 free(descriptors);
1457 } else {
1458 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1459
1460 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1461 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1462
1463 panfrost_update_sampler_view(view, &ctx->base);
1464
1465 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1466 }
1467
1468 postfix->textures = panfrost_upload_transient(batch,
1469 trampolines,
1470 sizeof(uint64_t) *
1471 ctx->sampler_view_count[stage]);
1472 }
1473 }
1474
1475 void
1476 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1477 enum pipe_shader_type stage,
1478 struct mali_vertex_tiler_postfix *postfix)
1479 {
1480 struct panfrost_context *ctx = batch->ctx;
1481 struct panfrost_device *device = pan_device(ctx->base.screen);
1482
1483 if (!ctx->sampler_count[stage])
1484 return;
1485
1486 if (device->quirks & IS_BIFROST) {
1487 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1488 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1489 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1490 transfer_size);
1491 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1492
1493 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1494 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1495
1496 postfix->sampler_descriptor = transfer.gpu;
1497 } else {
1498 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1499 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1500 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1501 transfer_size);
1502 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1503
1504 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1505 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1506
1507 postfix->sampler_descriptor = transfer.gpu;
1508 }
1509 }
1510
1511 void
1512 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1513 struct mali_vertex_tiler_postfix *vertex_postfix)
1514 {
1515 struct panfrost_context *ctx = batch->ctx;
1516
1517 if (!ctx->vertex)
1518 return;
1519
1520 struct panfrost_vertex_state *so = ctx->vertex;
1521
1522 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1523 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1524 sizeof(*so->hw) *
1525 PAN_MAX_ATTRIBUTE);
1526 }
1527
1528 void
1529 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1530 struct mali_vertex_tiler_postfix *vertex_postfix)
1531 {
1532 struct panfrost_context *ctx = batch->ctx;
1533 struct panfrost_vertex_state *so = ctx->vertex;
1534
1535 /* Staged mali_attr, and index into them. i =/= k, depending on the
1536 * vertex buffer mask and instancing. Twice as much room is allocated,
1537 * for a worst case of NPOT_DIVIDEs which take up an extra slot */
1538 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1539 unsigned k = 0;
1540
1541 for (unsigned i = 0; i < so->num_elements; ++i) {
1542 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1543 * means duplicating some vertex buffers (who cares? aside from
1544 * maybe some caching implications but I somehow doubt that
1545 * matters) */
1546
1547 struct pipe_vertex_element *elem = &so->pipe[i];
1548 unsigned vbi = elem->vertex_buffer_index;
1549
1550 /* The exception to 1:1 mapping is that we can have multiple
1551 * entries (NPOT divisors), so we fixup anyways */
1552
1553 so->hw[i].index = k;
1554
1555 if (!(ctx->vb_mask & (1 << vbi)))
1556 continue;
1557
1558 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1559 struct panfrost_resource *rsrc;
1560
1561 rsrc = pan_resource(buf->buffer.resource);
1562 if (!rsrc)
1563 continue;
1564
1565 /* Align to 64 bytes by masking off the lower bits. This
1566 * will be adjusted back when we fixup the src_offset in
1567 * mali_attr_meta */
1568
1569 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1570 mali_ptr addr = raw_addr & ~63;
1571 unsigned chopped_addr = raw_addr - addr;
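/* For example, a raw_addr ending in 0x47 yields an addr ending in 0x40
 * with chopped_addr = 7; those 7 bytes are added back to the size below
 * and compensated via src_offset in the attribute metadata. */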
1572
1573 /* Add a dependency of the batch on the vertex buffer */
1574 panfrost_batch_add_bo(batch, rsrc->bo,
1575 PAN_BO_ACCESS_SHARED |
1576 PAN_BO_ACCESS_READ |
1577 PAN_BO_ACCESS_VERTEX_TILER);
1578
1579 /* Set common fields */
1580 attrs[k].elements = addr;
1581 attrs[k].stride = buf->stride;
1582
1583 /* Since we advanced the base pointer, we shrink the buffer
1584 * size */
1585 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1586
1587 /* We need to add the extra size we masked off (for
1588 * correctness) so the data doesn't get clamped away */
1589 attrs[k].size += chopped_addr;
1590
1591 /* For non-instancing make sure we initialize */
1592 attrs[k].shift = attrs[k].extra_flags = 0;
1593
1594 /* Instancing uses a dramatically different code path than
1595 * linear, so dispatch for the actual emission now that the
1596 * common code is finished */
1597
1598 unsigned divisor = elem->instance_divisor;
1599
1600 if (divisor && ctx->instance_count == 1) {
1601 /* Silly corner case where there's a divisor(=1) but
1602 * there's no legitimate instancing. So we want *every*
1603 * attribute to be the same. So set stride to zero so
1604 * we don't go anywhere. */
1605
1606 attrs[k].size = attrs[k].stride + chopped_addr;
1607 attrs[k].stride = 0;
1608 attrs[k++].elements |= MALI_ATTR_LINEAR;
1609 } else if (ctx->instance_count <= 1) {
1610 /* Normal, non-instanced attributes */
1611 attrs[k++].elements |= MALI_ATTR_LINEAR;
1612 } else {
1613 unsigned instance_shift = vertex_postfix->instance_shift;
1614 unsigned instance_odd = vertex_postfix->instance_odd;
1615
1616 k += panfrost_vertex_instanced(ctx->padded_count,
1617 instance_shift,
1618 instance_odd,
1619 divisor, &attrs[k]);
1620 }
1621 }
1622
1623 /* Add special gl_VertexID/gl_InstanceID buffers */
1624
1625 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1626 so->hw[PAN_VERTEX_ID].index = k++;
1627 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1628 so->hw[PAN_INSTANCE_ID].index = k++;
1629
1630 /* Upload whatever we emitted and go */
1631
1632 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1633 k * sizeof(*attrs));
1634 }
1635
1636 static mali_ptr
1637 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1638 unsigned stride, unsigned count)
1639 {
1640 /* Fill out the descriptor */
1641 slot->stride = stride;
1642 slot->size = stride * count;
1643 slot->shift = slot->extra_flags = 0;
1644
1645 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1646 slot->size);
1647
1648 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1649
1650 return transfer.gpu;
1651 }
1652
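/* Computes how far into its 64-byte-aligned line a stream-out capture
 * begins; panfrost_emit_streamout below masks the base address down to
 * the same 64-byte boundary. The stride is in dwords, hence the * 4. */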
1653 static unsigned
1654 panfrost_streamout_offset(unsigned stride, unsigned offset,
1655 struct pipe_stream_output_target *target)
1656 {
1657 return (target->buffer_offset + (offset * stride * 4)) & 63;
1658 }
1659
1660 static void
1661 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1662 unsigned stride, unsigned offset, unsigned count,
1663 struct pipe_stream_output_target *target)
1664 {
1665 /* Fill out the descriptor */
1666 slot->stride = stride * 4;
1667 slot->shift = slot->extra_flags = 0;
1668
1669 unsigned max_size = target->buffer_size;
1670 unsigned expected_size = slot->stride * count;
1671
1672 /* Grab the BO and bind it to the batch */
1673 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1674
1675 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1676 * the perspective of the TILER and FRAGMENT.
1677 */
1678 panfrost_batch_add_bo(batch, bo,
1679 PAN_BO_ACCESS_SHARED |
1680 PAN_BO_ACCESS_RW |
1681 PAN_BO_ACCESS_VERTEX_TILER |
1682 PAN_BO_ACCESS_FRAGMENT);
1683
1684 /* We will have an offset applied to get alignment */
1685 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1686 slot->elements = (addr & ~63) | MALI_ATTR_LINEAR;
1687 slot->size = MIN2(max_size, expected_size) + (addr & 63);
1688 }
1689
1690 static bool
1691 has_point_coord(unsigned mask, gl_varying_slot loc)
1692 {
1693 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1694 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1695 else if (loc == VARYING_SLOT_PNTC)
1696 return (mask & (1 << 8));
1697 else
1698 return false;
1699 }
1700
1701 /* Helpers for manipulating stream out information so we can pack varyings
1702 * accordingly. Compute the src_offset for a given captured varying */
1703
1704 static struct pipe_stream_output *
1705 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1706 {
1707 for (unsigned i = 0; i < info->num_outputs; ++i) {
1708 if (info->output[i].register_index == loc)
1709 return &info->output[i];
1710 }
1711
1712 unreachable("Varying not captured");
1713 }
1714
1715 static unsigned
1716 pan_varying_size(enum mali_format fmt)
1717 {
1718 unsigned type = MALI_EXTRACT_TYPE(fmt);
1719 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1720 unsigned bits = MALI_EXTRACT_BITS(fmt);
1721 unsigned bpc = 0;
1722
1723 if (bits == MALI_CHANNEL_FLOAT) {
1724 /* No doubles */
1725 bool fp16 = (type == MALI_FORMAT_SINT);
1726 assert(fp16 || (type == MALI_FORMAT_UNORM));
1727
1728 bpc = fp16 ? 2 : 4;
1729 } else {
1730 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1731
1732 /* See the enums */
1733 bits = 1 << bits;
1734 assert(bits >= 8);
1735 bpc = bits / 8;
1736 }
1737
1738 return bpc * chan;
1739 }
1740
1741 /* Indices for named (non-XFB) varyings that are present. These are packed
1742 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1743 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1744 * of a given special field given a shift S by:
1745 *
1746 * idx = popcount(P & ((1 << S) - 1))
1747 *
1748 * That is... look at all of the varyings that come earlier and count them; that
1749 * count is the index of the new one. Likewise, the total number of special
1750 * buffers required is simply popcount(P)
1751 */
1752
1753 enum pan_special_varying {
1754 PAN_VARY_GENERAL = 0,
1755 PAN_VARY_POSITION = 1,
1756 PAN_VARY_PSIZ = 2,
1757 PAN_VARY_PNTCOORD = 3,
1758 PAN_VARY_FACE = 4,
1759 PAN_VARY_FRAGCOORD = 5,
1760
1761 /* Keep last */
1762 PAN_VARY_MAX,
1763 };
1764
1765 /* Given a varying, figure out which index it corresponds to */
1766
1767 static inline unsigned
1768 pan_varying_index(unsigned present, enum pan_special_varying v)
1769 {
1770 unsigned mask = (1 << v) - 1;
1771 return util_bitcount(present & mask);
1772 }
1773
1774 /* Get the base offset for XFB buffers, which by convention come after
1775 * everything else. Wrapper function for semantic reasons; by construction this
1776 * is just popcount. */
1777
1778 static inline unsigned
1779 pan_xfb_base(unsigned present)
1780 {
1781 return util_bitcount(present);
1782 }
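/* As an illustration with a made-up mask: if present = (1 << PAN_VARY_GENERAL)
 * | (1 << PAN_VARY_POSITION) | (1 << PAN_VARY_PSIZ), then
 * pan_varying_index(present, PAN_VARY_PSIZ) = popcount(present & 0b011) = 2,
 * so the point size buffer sits after the general and position buffers, and
 * pan_xfb_base(present) = popcount(0b111) = 3, so any XFB buffers start at
 * index 3. */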
1783
1784 /* Computes the present mask for varyings so we can start emitting varying records */
1785
1786 static inline unsigned
1787 pan_varying_present(
1788 struct panfrost_shader_state *vs,
1789 struct panfrost_shader_state *fs,
1790 unsigned quirks)
1791 {
1792 /* At the moment we always emit general and position buffers. Not
1793 * strictly necessary but usually harmless */
1794
1795 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1796
1797 /* Enable special buffers by the shader info */
1798
1799 if (vs->writes_point_size)
1800 present |= (1 << PAN_VARY_PSIZ);
1801
1802 if (fs->reads_point_coord)
1803 present |= (1 << PAN_VARY_PNTCOORD);
1804
1805 if (fs->reads_face)
1806 present |= (1 << PAN_VARY_FACE);
1807
1808 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1809 present |= (1 << PAN_VARY_FRAGCOORD);
1810
1811 /* Also, if we have a point sprite, we need a point coord buffer */
1812
1813 for (unsigned i = 0; i < fs->varying_count; i++) {
1814 gl_varying_slot loc = fs->varyings_loc[i];
1815
1816 if (has_point_coord(fs->point_sprite_mask, loc))
1817 present |= (1 << PAN_VARY_PNTCOORD);
1818 }
1819
1820 return present;
1821 }
1822
1823 /* Emitters for varying records */
1824
1825 static struct mali_attr_meta
1826 pan_emit_vary(unsigned present, enum pan_special_varying buf,
1827 unsigned quirks, enum mali_format format,
1828 unsigned offset)
1829 {
1830 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1831
1832 struct mali_attr_meta meta = {
1833 .index = pan_varying_index(present, buf),
1834 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1835 .swizzle = quirks & HAS_SWIZZLES ?
1836 panfrost_get_default_swizzle(nr_channels) :
1837 panfrost_bifrost_swizzle(nr_channels),
1838 .format = format,
1839 .src_offset = offset
1840 };
1841
1842 return meta;
1843 }
1844
1845 /* General varying that is unused */
1846
1847 static struct mali_attr_meta
1848 pan_emit_vary_only(unsigned present, unsigned quirks)
1849 {
1850 return pan_emit_vary(present, 0, quirks, MALI_VARYING_DISCARD, 0);
1851 }
1852
1853 /* Special records */
1854
1855 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1856 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1857 [PAN_VARY_PSIZ] = MALI_R16F,
1858 [PAN_VARY_PNTCOORD] = MALI_R16F,
1859 [PAN_VARY_FACE] = MALI_R32I,
1860 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1861 };
1862
1863 static struct mali_attr_meta
1864 pan_emit_vary_special(unsigned present, enum pan_special_varying buf,
1865 unsigned quirks)
1866 {
1867 assert(buf < PAN_VARY_MAX);
1868 return pan_emit_vary(present, buf, quirks, pan_varying_formats[buf], 0);
1869 }
1870
1871 static enum mali_format
1872 pan_xfb_format(enum mali_format format, unsigned nr)
1873 {
1874 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1875 return MALI_R32F | MALI_NR_CHANNELS(nr);
1876 else
1877 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1878 }
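/* For instance (illustrative only): a half-float varying captured with three
 * components becomes a three-channel 32-bit float format, and an integer
 * varying likewise keeps its type but is widened to 32 bits per channel,
 * matching the highp data the XFB path writes out. */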
1879
1880 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1881 * a bitfield) 32-bit, smaller than a 64-bit pointer, so we may as well pass it
1882 * by value. */
1883
1884 static struct mali_attr_meta
1885 pan_emit_vary_xfb(unsigned present,
1886 unsigned max_xfb,
1887 unsigned *streamout_offsets,
1888 unsigned quirks,
1889 enum mali_format format,
1890 struct pipe_stream_output o)
1891 {
1892 /* Construct a record for it */
1893 struct mali_attr_meta meta = {
1894 /* XFB buffers come after everything else */
1895 .index = pan_xfb_base(present) + o.output_buffer,
1896
1897 /* The usual unknown bit */
1898 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1899
1900 /* Override swizzle with number of channels */
1901 .swizzle = quirks & HAS_SWIZZLES ?
1902 panfrost_get_default_swizzle(o.num_components) :
1903 panfrost_bifrost_swizzle(o.num_components),
1904
1905 /* Override number of channels and precision to highp */
1906 .format = pan_xfb_format(format, o.num_components),
1907
1908 /* Apply given offsets together */
1909 .src_offset = (o.dst_offset * 4) /* dwords */
1910 + streamout_offsets[o.output_buffer]
1911 };
1912
1913 return meta;
1914 }
1915
1916 /* Determine if we should capture a varying for XFB. This requires actually
1917 * having a buffer for it. If we don't capture it, we'll fall back to a general
1918 * varying path (linked or unlinked, possibly discarding the write) */
1919
1920 static bool
1921 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1922 unsigned loc, unsigned max_xfb)
1923 {
1924 if (!(xfb->so_mask & (1ull << loc)))
1925 return false;
1926
1927 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1928 return o->output_buffer < max_xfb;
1929 }
1930
1931 /* Higher-level wrapper around all of the above, classifying a varying into one
1932 * of the record types handled above */
1933
1934 static struct mali_attr_meta
1935 panfrost_emit_varying(
1936 struct panfrost_shader_state *stage,
1937 struct panfrost_shader_state *other,
1938 struct panfrost_shader_state *xfb,
1939 unsigned present,
1940 unsigned max_xfb,
1941 unsigned *streamout_offsets,
1942 unsigned quirks,
1943 unsigned *gen_offsets,
1944 enum mali_format *gen_formats,
1945 unsigned *gen_stride,
1946 unsigned idx,
1947 bool should_alloc,
1948 bool is_fragment)
1949 {
1950 gl_varying_slot loc = stage->varyings_loc[idx];
1951 enum mali_format format = stage->varyings[idx];
1952
1953 /* Override format to match linkage */
1954 if (!should_alloc && gen_formats[idx])
1955 format = gen_formats[idx];
1956
1957 if (has_point_coord(stage->point_sprite_mask, loc)) {
1958 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1959 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1960 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1961 return pan_emit_vary_xfb(present, max_xfb, streamout_offsets, quirks, format, *o);
1962 } else if (loc == VARYING_SLOT_POS) {
1963 if (is_fragment)
1964 return pan_emit_vary_special(present, PAN_VARY_FRAGCOORD, quirks);
1965 else
1966 return pan_emit_vary_special(present, PAN_VARY_POSITION, quirks);
1967 } else if (loc == VARYING_SLOT_PSIZ) {
1968 return pan_emit_vary_special(present, PAN_VARY_PSIZ, quirks);
1969 } else if (loc == VARYING_SLOT_PNTC) {
1970 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1971 } else if (loc == VARYING_SLOT_FACE) {
1972 return pan_emit_vary_special(present, PAN_VARY_FACE, quirks);
1973 }
1974
1975 /* We've exhausted special cases, so it's otherwise a general varying. Check if we're linked */
1976 signed other_idx = -1;
1977
1978 for (unsigned j = 0; j < other->varying_count; ++j) {
1979 if (other->varyings_loc[j] == loc) {
1980 other_idx = j;
1981 break;
1982 }
1983 }
1984
1985 if (other_idx < 0)
1986 return pan_emit_vary_only(present, quirks);
1987
1988 unsigned offset = gen_offsets[other_idx];
1989
1990 if (should_alloc) {
1991 /* We're linked, so allocate a space via a watermark allocation */
1992 enum mali_format alt = other->varyings[other_idx];
1993
1994 /* Do interpolation at minimum precision */
1995 unsigned size_main = pan_varying_size(format);
1996 unsigned size_alt = pan_varying_size(alt);
1997 unsigned size = MIN2(size_main, size_alt);
1998
1999 /* If a varying is marked for XFB but not actually captured, we
2000 * should match the format to the format that would otherwise
2001 * be used for XFB, since dEQP checks for invariance here. It's
2002 * unclear if this is required by the spec. */
2003
2004 if (xfb->so_mask & (1ull << loc)) {
2005 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
2006 format = pan_xfb_format(format, o->num_components);
2007 size = pan_varying_size(format);
2008 } else if (size == size_alt) {
2009 format = alt;
2010 }
2011
2012 gen_offsets[idx] = *gen_stride;
2013 gen_formats[other_idx] = format;
2014 offset = *gen_stride;
2015 *gen_stride += size;
2016 }
2017
2018 return pan_emit_vary(present, PAN_VARY_GENERAL,
2019 quirks, format, offset);
2020 }
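/* To sketch the watermark allocation above with made-up shaders: if the VS
 * emits a highp vec4 (16 bytes) followed by a vec2 that both sides declare
 * mediump (4 bytes), the first call records offset 0 and bumps *gen_stride to
 * 16, the second records offset 16 and bumps it to 20, and that final
 * *gen_stride becomes the per-vertex stride of the general varying buffer
 * allocated in panfrost_emit_varying_descriptor(). */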
2021
2022 static void
2023 pan_emit_special_input(union mali_attr *varyings,
2024 unsigned present,
2025 enum pan_special_varying v,
2026 mali_ptr addr)
2027 {
2028 if (present & (1 << v)) {
2029 /* Write the record exactly once for performance, with the remaining
2030 * fields zeroed to avoid flaky results from stale data */
2031
2032 union mali_attr s = {
2033 .elements = addr
2034 };
2035
2036 varyings[pan_varying_index(present, v)] = s;
2037 }
2038 }
2039
2040 void
2041 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2042 unsigned vertex_count,
2043 struct mali_vertex_tiler_postfix *vertex_postfix,
2044 struct mali_vertex_tiler_postfix *tiler_postfix,
2045 union midgard_primitive_size *primitive_size)
2046 {
2047 /* Load the shaders */
2048 struct panfrost_context *ctx = batch->ctx;
2049 struct panfrost_device *dev = pan_device(ctx->base.screen);
2050 struct panfrost_shader_state *vs, *fs;
2051 size_t vs_size, fs_size;
2052
2053 /* Allocate the varying descriptor */
2054
2055 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2056 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
2057 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
2058 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
2059
2060 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
2061 vs_size +
2062 fs_size);
2063
2064 struct pipe_stream_output_info *so = &vs->stream_output;
2065 unsigned present = pan_varying_present(vs, fs, dev->quirks);
2066
2067 /* Check if this varying is linked by us. This is the case for
2068 * general-purpose, non-captured varyings. If it is, link it. If it's
2069 * not, use the provided stream out information to determine the
2070 * offset, since it was already linked for us. */
2071
2072 unsigned gen_offsets[32];
2073 enum mali_format gen_formats[32];
2074 memset(gen_offsets, 0, sizeof(gen_offsets));
2075 memset(gen_formats, 0, sizeof(gen_formats));
2076
2077 unsigned gen_stride = 0;
2078 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
2079 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
2080
2081 unsigned streamout_offsets[32];
2082
2083 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2084 streamout_offsets[i] = panfrost_streamout_offset(
2085 so->stride[i],
2086 ctx->streamout.offsets[i],
2087 ctx->streamout.targets[i]);
2088 }
2089
2090 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
2091 struct mali_attr_meta *ofs = ovs + vs->varying_count;
2092
2093 for (unsigned i = 0; i < vs->varying_count; i++) {
2094 ovs[i] = panfrost_emit_varying(vs, fs, vs, present,
2095 ctx->streamout.num_targets, streamout_offsets,
2096 dev->quirks,
2097 gen_offsets, gen_formats, &gen_stride, i, true, false);
2098 }
2099
2100 for (unsigned i = 0; i < fs->varying_count; i++) {
2101 ofs[i] = panfrost_emit_varying(fs, vs, vs, present,
2102 ctx->streamout.num_targets, streamout_offsets,
2103 dev->quirks,
2104 gen_offsets, gen_formats, &gen_stride, i, false, true);
2105 }
2106
2107 unsigned xfb_base = pan_xfb_base(present);
2108 struct panfrost_transfer T = panfrost_allocate_transient(batch,
2109 sizeof(union mali_attr) * (xfb_base + ctx->streamout.num_targets));
2110 union mali_attr *varyings = (union mali_attr *) T.cpu;
2111
2112 /* Emit the stream out buffers */
2113
2114 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2115 ctx->vertex_count);
2116
2117 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2118 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2119 so->stride[i],
2120 ctx->streamout.offsets[i],
2121 out_count,
2122 ctx->streamout.targets[i]);
2123 }
2124
2125 panfrost_emit_varyings(batch,
2126 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2127 gen_stride, vertex_count);
2128
2129 /* fp32 vec4 gl_Position */
2130 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2131 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2132 sizeof(float) * 4, vertex_count);
2133
2134 if (present & (1 << PAN_VARY_PSIZ)) {
2135 primitive_size->pointer = panfrost_emit_varyings(batch,
2136 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2137 2, vertex_count);
2138 }
2139
2140 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_VARYING_POINT_COORD);
2141 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_VARYING_FRONT_FACING);
2142 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_VARYING_FRAG_COORD);
2143
2144 vertex_postfix->varyings = T.gpu;
2145 tiler_postfix->varyings = T.gpu;
2146
2147 vertex_postfix->varying_meta = trans.gpu;
2148 tiler_postfix->varying_meta = trans.gpu + vs_size;
2149 }
2150
2151 void
2152 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2153 struct mali_vertex_tiler_prefix *vertex_prefix,
2154 struct mali_vertex_tiler_postfix *vertex_postfix,
2155 struct mali_vertex_tiler_prefix *tiler_prefix,
2156 struct mali_vertex_tiler_postfix *tiler_postfix,
2157 union midgard_primitive_size *primitive_size)
2158 {
2159 struct panfrost_context *ctx = batch->ctx;
2160 struct panfrost_device *device = pan_device(ctx->base.screen);
2161 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
2162 struct bifrost_payload_vertex bifrost_vertex = {0,};
2163 struct bifrost_payload_tiler bifrost_tiler = {0,};
2164 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2165 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2166 void *vp, *tp;
2167 size_t vp_size, tp_size;
2168
2169 if (device->quirks & IS_BIFROST) {
2170 bifrost_vertex.prefix = *vertex_prefix;
2171 bifrost_vertex.postfix = *vertex_postfix;
2172 vp = &bifrost_vertex;
2173 vp_size = sizeof(bifrost_vertex);
2174
2175 bifrost_tiler.prefix = *tiler_prefix;
2176 bifrost_tiler.tiler.primitive_size = *primitive_size;
2177 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2178 bifrost_tiler.postfix = *tiler_postfix;
2179 tp = &bifrost_tiler;
2180 tp_size = sizeof(bifrost_tiler);
2181 } else {
2182 midgard_vertex.prefix = *vertex_prefix;
2183 midgard_vertex.postfix = *vertex_postfix;
2184 vp = &midgard_vertex;
2185 vp_size = sizeof(midgard_vertex);
2186
2187 midgard_tiler.prefix = *tiler_prefix;
2188 midgard_tiler.postfix = *tiler_postfix;
2189 midgard_tiler.primitive_size = *primitive_size;
2190 tp = &midgard_tiler;
2191 tp_size = sizeof(midgard_tiler);
2192 }
2193
2194 if (wallpapering) {
2195 /* Inject in reverse order, with "predicted" job indices.
2196 * THIS IS A HACK XXX */
2197 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2198 batch->job_index + 2, tp, tp_size, true);
2199 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2200 vp, vp_size, true);
2201 return;
2202 }
2203
2204 /* If rasterizer discard is enabled, only submit the vertex job */
2205
2206 bool rasterizer_discard = ctx->rasterizer &&
2207 ctx->rasterizer->base.rasterizer_discard;
2208
2209 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2210 vp, vp_size, false);
2211
2212 if (rasterizer_discard)
2213 return;
2214
2215 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2216 false);
2217 }
2218
2219 /* TODO: stop hardcoding this */
2220 mali_ptr
2221 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2222 {
2223 uint16_t locations[] = {
2224 128, 128,
2225 0, 256,
2226 0, 256,
2227 0, 256,
2228 0, 256,
2229 0, 256,
2230 0, 256,
2231 0, 256,
2232 0, 256,
2233 0, 256,
2234 0, 256,
2235 0, 256,
2236 0, 256,
2237 0, 256,
2238 0, 256,
2239 0, 256,
2240 0, 256,
2241 0, 256,
2242 0, 256,
2243 0, 256,
2244 0, 256,
2245 0, 256,
2246 0, 256,
2247 0, 256,
2248 0, 256,
2249 0, 256,
2250 0, 256,
2251 0, 256,
2252 0, 256,
2253 0, 256,
2254 0, 256,
2255 0, 256,
2256 128, 128,
2257 0, 0,
2258 0, 0,
2259 0, 0,
2260 0, 0,
2261 0, 0,
2262 0, 0,
2263 0, 0,
2264 0, 0,
2265 0, 0,
2266 0, 0,
2267 0, 0,
2268 0, 0,
2269 0, 0,
2270 0, 0,
2271 0, 0,
2272 };
2273
2274 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2275 }