panfrost: Clean up panfrost_frag_meta_rasterizer_update
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), though it could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the bounds of the indices actually referenced by the draw */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
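 /* Illustrative example (hypothetical numbers): indices spanning
  * [100, 227] with index_bias 0 give *vertex_count = 128,
  * offset_start = 100 and offset_bias_correction = -100. */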
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
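 /* The (instance_shift, instance_odd) pair encodes the padded count as
  * padded_count == (2 * odd + 1) << shift, which is what the ctz/shift
  * below extract; e.g. 24 == 0b11000 decomposes as shift = 3, odd = 1. */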
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x950020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else {
342 meta->bifrost2.preload_regs = 0x1;
343 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
344 }
345
346 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 } else {
349 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
350 ss->uniform_cutoff);
351 meta->midgard1.work_count = ss->work_reg_count;
352
353 /* TODO: This is not conformant on ES3 */
354 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
355
356 meta->midgard1.flags_lo = 0x20;
357 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
358
359 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
360 }
361 }
362
363 static unsigned
364 panfrost_translate_compare_func(enum pipe_compare_func in)
365 {
366 switch (in) {
367 case PIPE_FUNC_NEVER:
368 return MALI_FUNC_NEVER;
369
370 case PIPE_FUNC_LESS:
371 return MALI_FUNC_LESS;
372
373 case PIPE_FUNC_EQUAL:
374 return MALI_FUNC_EQUAL;
375
376 case PIPE_FUNC_LEQUAL:
377 return MALI_FUNC_LEQUAL;
378
379 case PIPE_FUNC_GREATER:
380 return MALI_FUNC_GREATER;
381
382 case PIPE_FUNC_NOTEQUAL:
383 return MALI_FUNC_NOTEQUAL;
384
385 case PIPE_FUNC_GEQUAL:
386 return MALI_FUNC_GEQUAL;
387
388 case PIPE_FUNC_ALWAYS:
389 return MALI_FUNC_ALWAYS;
390
391 default:
392 unreachable("Invalid func");
393 }
394 }
395
396 static unsigned
397 panfrost_translate_stencil_op(enum pipe_stencil_op in)
398 {
399 switch (in) {
400 case PIPE_STENCIL_OP_KEEP:
401 return MALI_STENCIL_KEEP;
402
403 case PIPE_STENCIL_OP_ZERO:
404 return MALI_STENCIL_ZERO;
405
406 case PIPE_STENCIL_OP_REPLACE:
407 return MALI_STENCIL_REPLACE;
408
409 case PIPE_STENCIL_OP_INCR:
410 return MALI_STENCIL_INCR;
411
412 case PIPE_STENCIL_OP_DECR:
413 return MALI_STENCIL_DECR;
414
415 case PIPE_STENCIL_OP_INCR_WRAP:
416 return MALI_STENCIL_INCR_WRAP;
417
418 case PIPE_STENCIL_OP_DECR_WRAP:
419 return MALI_STENCIL_DECR_WRAP;
420
421 case PIPE_STENCIL_OP_INVERT:
422 return MALI_STENCIL_INVERT;
423
424 default:
425 unreachable("Invalid stencil op");
426 }
427 }
428
429 static unsigned
430 translate_tex_wrap(enum pipe_tex_wrap w)
431 {
432 switch (w) {
433 case PIPE_TEX_WRAP_REPEAT:
434 return MALI_WRAP_REPEAT;
435
436 case PIPE_TEX_WRAP_CLAMP:
437 return MALI_WRAP_CLAMP;
438
439 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
440 return MALI_WRAP_CLAMP_TO_EDGE;
441
442 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
443 return MALI_WRAP_CLAMP_TO_BORDER;
444
445 case PIPE_TEX_WRAP_MIRROR_REPEAT:
446 return MALI_WRAP_MIRRORED_REPEAT;
447
448 case PIPE_TEX_WRAP_MIRROR_CLAMP:
449 return MALI_WRAP_MIRRORED_CLAMP;
450
451 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
452 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
453
454 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
455 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
456
457 default:
458 unreachable("Invalid wrap");
459 }
460 }
461
462 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
463 struct mali_sampler_descriptor *hw)
464 {
465 unsigned func = panfrost_translate_compare_func(cso->compare_func);
466 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
467 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
468 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
469 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
470 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
471 unsigned mip_filter = mip_linear ?
472 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
473 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
474
475 *hw = (struct mali_sampler_descriptor) {
476 .filter_mode = min_filter | mag_filter | mip_filter |
477 normalized,
478 .wrap_s = translate_tex_wrap(cso->wrap_s),
479 .wrap_t = translate_tex_wrap(cso->wrap_t),
480 .wrap_r = translate_tex_wrap(cso->wrap_r),
481 .compare_func = panfrost_flip_compare_func(func),
482 .border_color = {
483 cso->border_color.f[0],
484 cso->border_color.f[1],
485 cso->border_color.f[2],
486 cso->border_color.f[3]
487 },
488 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
489 .max_lod = FIXED_16(cso->max_lod, false),
490 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
491 .seamless_cube_map = cso->seamless_cube_map,
492 };
493
494 /* If necessary, we disable mipmapping in the sampler descriptor by
495 * clamping the LOD as tight as possible (from 0 to epsilon,
496 * essentially -- remember these are fixed point numbers, so
497 * epsilon=1/256) */
498
499 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
500 hw->max_lod = hw->min_lod + 1;
501 }
502
503 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
504 struct bifrost_sampler_descriptor *hw)
505 {
506 *hw = (struct bifrost_sampler_descriptor) {
507 .unk1 = 0x1,
508 .wrap_s = translate_tex_wrap(cso->wrap_s),
509 .wrap_t = translate_tex_wrap(cso->wrap_t),
510 .wrap_r = translate_tex_wrap(cso->wrap_r),
511 .unk8 = 0x8,
512 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
513 .norm_coords = cso->normalized_coords,
514 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
515 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
516 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
517 .max_lod = FIXED_16(cso->max_lod, false),
518 };
519
520 /* If necessary, we disable mipmapping in the sampler descriptor by
521 * clamping the LOD as tight as possible (from 0 to epsilon,
522 * essentially -- remember these are fixed point numbers, so
523 * epsilon=1/256) */
524
525 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
526 hw->max_lod = hw->min_lod + 1;
527 }
528
529 static void
530 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
531 struct mali_stencil_test *out)
532 {
533 out->ref = 0; /* Gallium gets it from elsewhere */
534
535 out->mask = in->valuemask;
536 out->func = panfrost_translate_compare_func(in->func);
537 out->sfail = panfrost_translate_stencil_op(in->fail_op);
538 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
539 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
540 }
541
542 static void
543 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
544 struct mali_shader_meta *fragmeta)
545 {
546 if (!ctx->rasterizer) {
547 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
548 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
549 fragmeta->depth_units = 0.0f;
550 fragmeta->depth_factor = 0.0f;
551 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
552 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
553 return;
554 }
555
556 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
557
558 bool msaa = rast->multisample;
559
560 /* TODO: Sample size */
561 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
562 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
563 fragmeta->depth_units = rast->offset_units * 2.0f;
564 fragmeta->depth_factor = rast->offset_scale;
565
566 /* XXX: Which bit is which? Does this maybe allow offsetting non-tris? */
567
568 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
569 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
570 }
571
572 static void
573 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
574 struct mali_shader_meta *fragmeta)
575 {
576 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
577 int zfunc = PIPE_FUNC_ALWAYS;
578
579 if (!zsa) {
580 struct pipe_stencil_state default_stencil = {
581 .enabled = 0,
582 .func = PIPE_FUNC_ALWAYS,
583 .fail_op = MALI_STENCIL_KEEP,
584 .zfail_op = MALI_STENCIL_KEEP,
585 .zpass_op = MALI_STENCIL_KEEP,
586 .writemask = 0xFF,
587 .valuemask = 0xFF
588 };
589
590 panfrost_make_stencil_state(&default_stencil,
591 &fragmeta->stencil_front);
592 fragmeta->stencil_mask_front = default_stencil.writemask;
593 fragmeta->stencil_back = fragmeta->stencil_front;
594 fragmeta->stencil_mask_back = default_stencil.writemask;
595 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
596 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
597 } else {
598 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
599 zsa->stencil[0].enabled);
600 panfrost_make_stencil_state(&zsa->stencil[0],
601 &fragmeta->stencil_front);
602 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
603 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
604
605 /* If back-stencil is not enabled, use the front values */
606
607 if (zsa->stencil[1].enabled) {
608 panfrost_make_stencil_state(&zsa->stencil[1],
609 &fragmeta->stencil_back);
610 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
611 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
612 } else {
613 fragmeta->stencil_back = fragmeta->stencil_front;
614 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
615 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
616 }
617
618 if (zsa->depth.enabled)
619 zfunc = zsa->depth.func;
620
621 /* Depth state (TODO: Refactor) */
622
623 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
624 zsa->depth.writemask);
625 }
626
627 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
628 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
629 }
630
631 static bool
632 panfrost_fs_required(
633 struct panfrost_shader_state *fs,
634 struct panfrost_blend_final *blend,
635 unsigned rt_count)
636 {
637 /* If we generally have side effects */
638 if (fs->fs_sidefx)
639 return true;
640
641 /* If colour is written we need to execute */
642 for (unsigned i = 0; i < rt_count; ++i) {
643 if (!blend[i].no_colour)
644 return true;
645 }
646
647 /* If depth is written and not implied we need to execute.
648 * TODO: Predicate on Z/S writes being enabled */
649 return (fs->writes_depth || fs->writes_stencil);
650 }
651
652 static void
653 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
654 struct mali_shader_meta *fragmeta,
655 void *rts)
656 {
657 const struct panfrost_device *dev = pan_device(ctx->base.screen);
658 struct panfrost_shader_state *fs;
659 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
660
661 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
662 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
663 !ctx->blend->base.dither);
664
665 /* Get blending setup */
666 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
667
668 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
669 unsigned shader_offset = 0;
670 struct panfrost_bo *shader_bo = NULL;
671
672 for (unsigned c = 0; c < rt_count; ++c)
673 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
674 &shader_offset);
675
676 /* Disable shader execution if we can */
677 if (dev->quirks & MIDGARD_SHADERLESS
678 && !panfrost_fs_required(fs, blend, rt_count)) {
679 fragmeta->shader = 0;
680 fragmeta->attribute_count = 0;
681 fragmeta->varying_count = 0;
682 fragmeta->texture_count = 0;
683 fragmeta->sampler_count = 0;
684
685 /* This feature is not known to work on Bifrost */
686 fragmeta->midgard1.work_count = 1;
687 fragmeta->midgard1.uniform_count = 0;
688 fragmeta->midgard1.uniform_buffer_count = 0;
689 }
690
691 /* If there is a blend shader, work registers are shared. We impose 8
692 * work registers as a limit for blend shaders. Should be lower XXX */
693
694 if (!(dev->quirks & IS_BIFROST)) {
695 for (unsigned c = 0; c < rt_count; ++c) {
696 if (blend[c].is_shader) {
697 fragmeta->midgard1.work_count =
698 MAX2(fragmeta->midgard1.work_count, 8);
699 }
700 }
701 }
702
703 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
704 * copied to the blend_meta appended (by convention), but this is the
705 * field actually read by the hardware. (Or maybe both are read...?).
706 * Specify the last RTi with a blend shader. */
707
708 fragmeta->blend.shader = 0;
709
710 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
711 if (!blend[rt].is_shader)
712 continue;
713
714 fragmeta->blend.shader = blend[rt].shader.gpu |
715 blend[rt].shader.first_tag;
716 break;
717 }
718
719 if (dev->quirks & MIDGARD_SFBD) {
720 /* On platforms with only a single render target, the blend
721 * information lives inside the shader meta itself. We additionally
722 * need to signal CAN_DISCARD for nontrivial blend modes (so
723 * we're able to read back the destination buffer) */
724
725 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
726 blend[0].is_shader);
727
728 if (!blend[0].is_shader) {
729 fragmeta->blend.equation = *blend[0].equation.equation;
730 fragmeta->blend.constant = blend[0].equation.constant;
731 }
732
733 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
734 !blend[0].no_blending || fs->can_discard);
735 return;
736 }
737
738 if (dev->quirks & IS_BIFROST) {
739 bool no_blend = true;
740
741 for (unsigned i = 0; i < rt_count; ++i)
742 no_blend &= (blend[i].no_blending | blend[i].no_colour);
743
744 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
745 !fs->can_discard && !fs->writes_depth && no_blend);
746 }
747
748 /* Additional blend descriptor tacked on for jobs using MFBD */
749
750 for (unsigned i = 0; i < rt_count; ++i) {
751 unsigned flags = 0;
752
753 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
754 flags = 0x200;
755
756 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
757 (ctx->pipe_framebuffer.cbufs[i]) &&
758 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
759
760 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
761 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
762 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
763 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
764 }
765
766 if (dev->quirks & IS_BIFROST) {
767 struct bifrost_blend_rt *brts = rts;
768
769 brts[i].flags = flags;
770
771 if (blend[i].is_shader) {
772 /* The blend shader's address needs to be at
773 * the same top 32 bits as the fragment shader.
774 * TODO: Ensure that's always the case.
775 */
776 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
777 (fs->bo->gpu & (0xffffffffull << 32)));
778 brts[i].shader = blend[i].shader.gpu;
779 brts[i].unk2 = 0x0;
780 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
781 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
782 const struct util_format_description *format_desc;
783 format_desc = util_format_description(format);
784
785 brts[i].equation = *blend[i].equation.equation;
786
787 /* TODO: this is a bit more complicated */
788 brts[i].constant = blend[i].equation.constant;
789
790 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
791
792 /* 0x19 disables blending and forces REPLACE
793 * mode (equivalent to rgb_mode = alpha_mode =
794 * x122, colour mask = 0xF). 0x1a allows
795 * blending. */
796 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
797
798 brts[i].shader_type = fs->blend_types[i];
799 } else {
800 /* Dummy attachment for depth-only */
801 brts[i].unk2 = 0x3;
802 brts[i].shader_type = fs->blend_types[i];
803 }
804 } else {
805 struct midgard_blend_rt *mrts = rts;
806 mrts[i].flags = flags;
807
808 if (blend[i].is_shader) {
809 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
810 } else {
811 mrts[i].blend.equation = *blend[i].equation.equation;
812 mrts[i].blend.constant = blend[i].equation.constant;
813 }
814 }
815 }
816 }
817
818 static void
819 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
820 struct mali_shader_meta *fragmeta,
821 void *rts)
822 {
823 const struct panfrost_device *dev = pan_device(ctx->base.screen);
824 struct panfrost_shader_state *fs;
825
826 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
827
828 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
829 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
830 fragmeta->unknown2_4 = 0x4e0;
831
832 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
833 * is required (independent of 32-bit/64-bit descriptors), or why it's
834 * not used on later GPU revisions. Otherwise, all shader jobs fault on
835 * these earlier chips (perhaps this is a chicken bit of some kind).
836 * More investigation is needed. */
837
838 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
839
840 if (dev->quirks & IS_BIFROST) {
841 /* TODO */
842 } else {
843 /* Depending on whether it's legal to do so in the given shader, we try to
844 * enable early-z testing. TODO: respect e-z force */
845
846 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
847 !fs->can_discard && !fs->writes_global &&
848 !fs->writes_depth && !fs->writes_stencil);
849
850 /* Add the writes Z/S flags if needed. */
851 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
852 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
853
854 /* Any time texturing is used, derivatives are implicitly calculated,
855 * so we need to enable helper invocations */
856
857 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
858 fs->helper_invocations);
859
860 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
861
862 bool depth_enabled = fs->writes_depth ||
863 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
864
865 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
866 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
867 }
868
869 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
870 panfrost_frag_meta_zsa_update(ctx, fragmeta);
871 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
872 }
873
874 void
875 panfrost_emit_shader_meta(struct panfrost_batch *batch,
876 enum pipe_shader_type st,
877 struct mali_vertex_tiler_postfix *postfix)
878 {
879 struct panfrost_context *ctx = batch->ctx;
880 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
881
882 if (!ss) {
883 postfix->shader = 0;
884 return;
885 }
886
887 struct mali_shader_meta meta;
888
889 panfrost_shader_meta_init(ctx, st, &meta);
890
891 /* Add the shader BO to the batch. */
892 panfrost_batch_add_bo(batch, ss->bo,
893 PAN_BO_ACCESS_PRIVATE |
894 PAN_BO_ACCESS_READ |
895 panfrost_bo_access_for_stage(st));
896
897 mali_ptr shader_ptr;
898
899 if (st == PIPE_SHADER_FRAGMENT) {
900 struct panfrost_device *dev = pan_device(ctx->base.screen);
901 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
902 size_t desc_size = sizeof(meta);
903 void *rts = NULL;
904 struct panfrost_transfer xfer;
905 unsigned rt_size;
906
907 if (dev->quirks & MIDGARD_SFBD)
908 rt_size = 0;
909 else if (dev->quirks & IS_BIFROST)
910 rt_size = sizeof(struct bifrost_blend_rt);
911 else
912 rt_size = sizeof(struct midgard_blend_rt);
913
914 desc_size += rt_size * rt_count;
915
916 if (rt_size)
917 rts = rzalloc_size(ctx, rt_size * rt_count);
918
919 panfrost_frag_shader_meta_init(ctx, &meta, rts);
920
921 xfer = panfrost_allocate_transient(batch, desc_size);
922
923 memcpy(xfer.cpu, &meta, sizeof(meta));
924 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
925
926 if (rt_size)
927 ralloc_free(rts);
928
929 shader_ptr = xfer.gpu;
930 } else {
931 shader_ptr = panfrost_upload_transient(batch, &meta,
932 sizeof(meta));
933 }
934
935 postfix->shader = shader_ptr;
936 }
937
938 static void
939 panfrost_mali_viewport_init(struct panfrost_context *ctx,
940 struct mali_viewport *mvp)
941 {
942 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
943
944 /* Clip bounds are encoded as floats. The viewport itself is encoded as
945 * (somewhat) asymmetric ints. */
946
947 const struct pipe_scissor_state *ss = &ctx->scissor;
948
949 memset(mvp, 0, sizeof(*mvp));
950
951 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
952 * each direction. Clipping to the viewport in theory should work, but
953 * in practice causes issues when we're not explicitly trying to
954 * scissor */
955
956 *mvp = (struct mali_viewport) {
957 .clip_minx = -INFINITY,
958 .clip_miny = -INFINITY,
959 .clip_maxx = INFINITY,
960 .clip_maxy = INFINITY,
961 };
962
963 /* Always scissor to the viewport by default. */
964 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
965 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
966
967 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
968 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
969
970 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
971 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
972
973 /* Apply the scissor test */
974
975 unsigned minx, miny, maxx, maxy;
976
977 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
978 minx = MAX2(ss->minx, vp_minx);
979 miny = MAX2(ss->miny, vp_miny);
980 maxx = MIN2(ss->maxx, vp_maxx);
981 maxy = MIN2(ss->maxy, vp_maxy);
982 } else {
983 minx = vp_minx;
984 miny = vp_miny;
985 maxx = vp_maxx;
986 maxy = vp_maxy;
987 }
988
989 /* Hardware needs the min/max to be strictly ordered, so flip if we
990 * need to. The viewport transformation in the vertex shader will
991 * handle the negatives if we don't */
992
993 if (miny > maxy) {
994 unsigned temp = miny;
995 miny = maxy;
996 maxy = temp;
997 }
998
999 if (minx > maxx) {
1000 unsigned temp = minx;
1001 minx = maxx;
1002 maxx = temp;
1003 }
1004
1005 if (minz > maxz) {
1006 float temp = minz;
1007 minz = maxz;
1008 maxz = temp;
1009 }
1010
1011 /* Clamp to the framebuffer size as a last check */
1012
1013 minx = MIN2(ctx->pipe_framebuffer.width, minx);
1014 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
1015
1016 miny = MIN2(ctx->pipe_framebuffer.height, miny);
1017 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
1018
1019 /* Upload */
1020
1021 mvp->viewport0[0] = minx;
1022 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1023
1024 mvp->viewport0[1] = miny;
1025 mvp->viewport1[1] = MALI_POSITIVE(maxy);
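 /* MALI_POSITIVE(x) stores x - 1, so viewport1 carries the maximum
  * bound minus one -- the "(somewhat) asymmetric ints" noted at the top
  * of this function. */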
1026
1027 mvp->clip_minz = minz;
1028 mvp->clip_maxz = maxz;
1029 }
1030
1031 void
1032 panfrost_emit_viewport(struct panfrost_batch *batch,
1033 struct mali_vertex_tiler_postfix *tiler_postfix)
1034 {
1035 struct panfrost_context *ctx = batch->ctx;
1036 struct mali_viewport mvp;
1037
1038 panfrost_mali_viewport_init(batch->ctx, &mvp);
1039
1040 /* Update the job, unless we're doing wallpapering (whose lack of
1041 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1042 * just... be faster :) */
1043
1044 if (!ctx->wallpaper_batch)
1045 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1046 mvp.viewport0[1],
1047 mvp.viewport1[0] + 1,
1048 mvp.viewport1[1] + 1);
1049
1050 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1051 sizeof(mvp));
1052 }
1053
1054 static mali_ptr
1055 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1056 enum pipe_shader_type st,
1057 struct panfrost_constant_buffer *buf,
1058 unsigned index)
1059 {
1060 struct pipe_constant_buffer *cb = &buf->cb[index];
1061 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1062
1063 if (rsrc) {
1064 panfrost_batch_add_bo(batch, rsrc->bo,
1065 PAN_BO_ACCESS_SHARED |
1066 PAN_BO_ACCESS_READ |
1067 panfrost_bo_access_for_stage(st));
1068
1069 /* Alignment guaranteed by
1070 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1071 return rsrc->bo->gpu + cb->buffer_offset;
1072 } else if (cb->user_buffer) {
1073 return panfrost_upload_transient(batch,
1074 cb->user_buffer +
1075 cb->buffer_offset,
1076 cb->buffer_size);
1077 } else {
1078 unreachable("No constant buffer");
1079 }
1080 }
1081
1082 struct sysval_uniform {
1083 union {
1084 float f[4];
1085 int32_t i[4];
1086 uint32_t u[4];
1087 uint64_t du[2];
1088 };
1089 };
1090
1091 static void
1092 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1093 struct sysval_uniform *uniform)
1094 {
1095 struct panfrost_context *ctx = batch->ctx;
1096 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1097
1098 uniform->f[0] = vp->scale[0];
1099 uniform->f[1] = vp->scale[1];
1100 uniform->f[2] = vp->scale[2];
1101 }
1102
1103 static void
1104 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1105 struct sysval_uniform *uniform)
1106 {
1107 struct panfrost_context *ctx = batch->ctx;
1108 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1109
1110 uniform->f[0] = vp->translate[0];
1111 uniform->f[1] = vp->translate[1];
1112 uniform->f[2] = vp->translate[2];
1113 }
1114
1115 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1116 enum pipe_shader_type st,
1117 unsigned int sysvalid,
1118 struct sysval_uniform *uniform)
1119 {
1120 struct panfrost_context *ctx = batch->ctx;
1121 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1122 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1123 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1124 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1125
1126 assert(dim);
1127 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1128
1129 if (dim > 1)
1130 uniform->i[1] = u_minify(tex->texture->height0,
1131 tex->u.tex.first_level);
1132
1133 if (dim > 2)
1134 uniform->i[2] = u_minify(tex->texture->depth0,
1135 tex->u.tex.first_level);
1136
1137 if (is_array)
1138 uniform->i[dim] = tex->texture->array_size;
1139 }
1140
1141 static void
1142 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1143 enum pipe_shader_type st,
1144 unsigned ssbo_id,
1145 struct sysval_uniform *uniform)
1146 {
1147 struct panfrost_context *ctx = batch->ctx;
1148
1149 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1150 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1151
1152 /* Compute address */
1153 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1154
1155 panfrost_batch_add_bo(batch, bo,
1156 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1157 panfrost_bo_access_for_stage(st));
1158
1159 /* Upload address and size as sysval */
1160 uniform->du[0] = bo->gpu + sb.buffer_offset;
1161 uniform->u[2] = sb.buffer_size;
1162 }
1163
1164 static void
1165 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1166 enum pipe_shader_type st,
1167 unsigned samp_idx,
1168 struct sysval_uniform *uniform)
1169 {
1170 struct panfrost_context *ctx = batch->ctx;
1171 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1172
1173 uniform->f[0] = sampl->min_lod;
1174 uniform->f[1] = sampl->max_lod;
1175 uniform->f[2] = sampl->lod_bias;
1176
1177 /* Even without any errata, Midgard represents "no mipmapping" as
1178 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1179 * panfrost_create_sampler_state which also explains our choice of
1180 * epsilon value (again to keep behaviour consistent) */
1181
1182 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1183 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1184 }
1185
1186 static void
1187 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1188 struct sysval_uniform *uniform)
1189 {
1190 struct panfrost_context *ctx = batch->ctx;
1191
1192 uniform->u[0] = ctx->compute_grid->grid[0];
1193 uniform->u[1] = ctx->compute_grid->grid[1];
1194 uniform->u[2] = ctx->compute_grid->grid[2];
1195 }
1196
1197 static void
1198 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1199 struct panfrost_shader_state *ss,
1200 enum pipe_shader_type st)
1201 {
1202 struct sysval_uniform *uniforms = (void *)buf;
1203
1204 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1205 int sysval = ss->sysval[i];
1206
1207 switch (PAN_SYSVAL_TYPE(sysval)) {
1208 case PAN_SYSVAL_VIEWPORT_SCALE:
1209 panfrost_upload_viewport_scale_sysval(batch,
1210 &uniforms[i]);
1211 break;
1212 case PAN_SYSVAL_VIEWPORT_OFFSET:
1213 panfrost_upload_viewport_offset_sysval(batch,
1214 &uniforms[i]);
1215 break;
1216 case PAN_SYSVAL_TEXTURE_SIZE:
1217 panfrost_upload_txs_sysval(batch, st,
1218 PAN_SYSVAL_ID(sysval),
1219 &uniforms[i]);
1220 break;
1221 case PAN_SYSVAL_SSBO:
1222 panfrost_upload_ssbo_sysval(batch, st,
1223 PAN_SYSVAL_ID(sysval),
1224 &uniforms[i]);
1225 break;
1226 case PAN_SYSVAL_NUM_WORK_GROUPS:
1227 panfrost_upload_num_work_groups_sysval(batch,
1228 &uniforms[i]);
1229 break;
1230 case PAN_SYSVAL_SAMPLER:
1231 panfrost_upload_sampler_sysval(batch, st,
1232 PAN_SYSVAL_ID(sysval),
1233 &uniforms[i]);
1234 break;
1235 default:
1236 assert(0);
1237 }
1238 }
1239 }
1240
1241 static const void *
1242 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1243 unsigned index)
1244 {
1245 struct pipe_constant_buffer *cb = &buf->cb[index];
1246 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1247
1248 if (rsrc)
1249 return rsrc->bo->cpu;
1250 else if (cb->user_buffer)
1251 return cb->user_buffer;
1252 else
1253 unreachable("No constant buffer");
1254 }
1255
1256 void
1257 panfrost_emit_const_buf(struct panfrost_batch *batch,
1258 enum pipe_shader_type stage,
1259 struct mali_vertex_tiler_postfix *postfix)
1260 {
1261 struct panfrost_context *ctx = batch->ctx;
1262 struct panfrost_shader_variants *all = ctx->shader[stage];
1263
1264 if (!all)
1265 return;
1266
1267 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1268
1269 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1270
1271 /* Uniforms are implicitly UBO #0 */
1272 bool has_uniforms = buf->enabled_mask & (1 << 0);
1273
1274 /* Allocate room for the sysval and the uniforms */
1275 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1276 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1277 size_t size = sys_size + uniform_size;
1278 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1279 size);
1280
1281 /* Upload sysvals requested by the shader */
1282 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1283
1284 /* Upload uniforms */
1285 if (has_uniforms && uniform_size) {
1286 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1287 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1288 }
1289
1290 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1291 * uploaded */
1292
1293 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1294 assert(ubo_count >= 1);
1295
1296 size_t sz = sizeof(uint64_t) * ubo_count;
1297 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1298 int uniform_count = ss->uniform_count;
1299
1300 /* Upload uniforms as a UBO */
1301 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1302
1303 /* The rest are honest-to-goodness UBOs */
1304
1305 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1306 size_t usz = buf->cb[ubo].buffer_size;
1307 bool enabled = buf->enabled_mask & (1 << ubo);
1308 bool empty = usz == 0;
1309
1310 if (!enabled || empty) {
1311 /* Stub out disabled UBOs to catch accesses */
1312 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1313 continue;
1314 }
1315
1316 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1317 buf, ubo);
1318
1319 unsigned bytes_per_field = 16;
1320 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1321 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1322 }
1323
1324 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1325 postfix->uniforms = transfer.gpu;
1326 postfix->uniform_buffers = ubufs;
1327
1328 buf->dirty_mask = 0;
1329 }
1330
1331 void
1332 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1333 const struct pipe_grid_info *info,
1334 struct midgard_payload_vertex_tiler *vtp)
1335 {
1336 struct panfrost_context *ctx = batch->ctx;
1337 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1338 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1339 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1340 128));
1341 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1342 info->grid[2] * 4;
1343 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1344 shared_size,
1345 1);
1346
1347 struct mali_shared_memory shared = {
1348 .shared_memory = bo->gpu,
1349 .shared_workgroup_count =
1350 util_logbase2_ceil(info->grid[0]) +
1351 util_logbase2_ceil(info->grid[1]) +
1352 util_logbase2_ceil(info->grid[2]),
1353 .shared_unk1 = 0x2,
1354 .shared_shift = util_logbase2(single_size) - 1
1355 };
1356
1357 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1358 sizeof(shared));
1359 }
1360
1361 static mali_ptr
1362 panfrost_get_tex_desc(struct panfrost_batch *batch,
1363 enum pipe_shader_type st,
1364 struct panfrost_sampler_view *view)
1365 {
1366 if (!view)
1367 return (mali_ptr) 0;
1368
1369 struct pipe_sampler_view *pview = &view->base;
1370 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1371
1372 /* Add the BO to the job so it's retained until the job is done. */
1373
1374 panfrost_batch_add_bo(batch, rsrc->bo,
1375 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1376 panfrost_bo_access_for_stage(st));
1377
1378 panfrost_batch_add_bo(batch, view->bo,
1379 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1380 panfrost_bo_access_for_stage(st));
1381
1382 return view->bo->gpu;
1383 }
1384
1385 static void
1386 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1387 struct pipe_context *pctx)
1388 {
1389 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1390 if (view->layout != rsrc->layout) {
1391 panfrost_bo_unreference(view->bo);
1392 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1393 }
1394 }
1395
1396 void
1397 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1398 enum pipe_shader_type stage,
1399 struct mali_vertex_tiler_postfix *postfix)
1400 {
1401 struct panfrost_context *ctx = batch->ctx;
1402 struct panfrost_device *device = pan_device(ctx->base.screen);
1403
1404 if (!ctx->sampler_view_count[stage])
1405 return;
1406
1407 if (device->quirks & IS_BIFROST) {
1408 struct bifrost_texture_descriptor *descriptors;
1409
1410 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1411 ctx->sampler_view_count[stage]);
1412
1413 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1414 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1415 struct pipe_sampler_view *pview = &view->base;
1416 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1417 panfrost_update_sampler_view(view, &ctx->base);
1418
1419 /* Add the BOs to the job so they are retained until the job is done. */
1420
1421 panfrost_batch_add_bo(batch, rsrc->bo,
1422 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1423 panfrost_bo_access_for_stage(stage));
1424
1425 panfrost_batch_add_bo(batch, view->bo,
1426 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1427 panfrost_bo_access_for_stage(stage));
1428
1429 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1430 }
1431
1432 postfix->textures = panfrost_upload_transient(batch,
1433 descriptors,
1434 sizeof(struct bifrost_texture_descriptor) *
1435 ctx->sampler_view_count[stage]);
1436
1437 free(descriptors);
1438 } else {
1439 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1440
1441 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1442 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1443
1444 panfrost_update_sampler_view(view, &ctx->base);
1445
1446 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1447 }
1448
1449 postfix->textures = panfrost_upload_transient(batch,
1450 trampolines,
1451 sizeof(uint64_t) *
1452 ctx->sampler_view_count[stage]);
1453 }
1454 }
1455
1456 void
1457 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1458 enum pipe_shader_type stage,
1459 struct mali_vertex_tiler_postfix *postfix)
1460 {
1461 struct panfrost_context *ctx = batch->ctx;
1462 struct panfrost_device *device = pan_device(ctx->base.screen);
1463
1464 if (!ctx->sampler_count[stage])
1465 return;
1466
1467 if (device->quirks & IS_BIFROST) {
1468 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1469 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1470 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1471 transfer_size);
1472 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1473
1474 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1475 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1476
1477 postfix->sampler_descriptor = transfer.gpu;
1478 } else {
1479 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1480 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1481 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1482 transfer_size);
1483 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1484
1485 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1486 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1487
1488 postfix->sampler_descriptor = transfer.gpu;
1489 }
1490 }
1491
1492 void
1493 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1494 struct mali_vertex_tiler_postfix *vertex_postfix)
1495 {
1496 struct panfrost_context *ctx = batch->ctx;
1497
1498 if (!ctx->vertex)
1499 return;
1500
1501 struct panfrost_vertex_state *so = ctx->vertex;
1502
1503 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1504 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1505 sizeof(*so->hw) *
1506 PAN_MAX_ATTRIBUTE);
1507 }
1508
1509 void
1510 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1511 struct mali_vertex_tiler_postfix *vertex_postfix)
1512 {
1513 struct panfrost_context *ctx = batch->ctx;
1514 struct panfrost_vertex_state *so = ctx->vertex;
1515
1516 /* Staged mali_attr, and index into them. i =/= k, depending on the
1517 * vertex buffer mask and instancing. Twice as much room is allocated,
1518 * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1519 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1520 unsigned k = 0;
1521
1522 for (unsigned i = 0; i < so->num_elements; ++i) {
1523 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1524 * means duplicating some vertex buffers (who cares? aside from
1525 * maybe some caching implications but I somehow doubt that
1526 * matters) */
1527
1528 struct pipe_vertex_element *elem = &so->pipe[i];
1529 unsigned vbi = elem->vertex_buffer_index;
1530
1531 /* The exception to 1:1 mapping is that we can have multiple
1532 * entries (NPOT divisors), so we fix up anyway */
1533
1534 so->hw[i].index = k;
1535
1536 if (!(ctx->vb_mask & (1 << vbi)))
1537 continue;
1538
1539 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1540 struct panfrost_resource *rsrc;
1541
1542 rsrc = pan_resource(buf->buffer.resource);
1543 if (!rsrc)
1544 continue;
1545
1546 /* Align to 64 bytes by masking off the lower bits. This
1547 * will be adjusted back when we fixup the src_offset in
1548 * mali_attr_meta */
1549
1550 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1551 mali_ptr addr = raw_addr & ~63;
1552 unsigned chopped_addr = raw_addr - addr;
1553
1554 /* Add a dependency of the batch on the vertex buffer */
1555 panfrost_batch_add_bo(batch, rsrc->bo,
1556 PAN_BO_ACCESS_SHARED |
1557 PAN_BO_ACCESS_READ |
1558 PAN_BO_ACCESS_VERTEX_TILER);
1559
1560 /* Set common fields */
1561 attrs[k].elements = addr;
1562 attrs[k].stride = buf->stride;
1563
1564 /* Since we advanced the base pointer, we shrink the buffer
1565 * size */
1566 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1567
1568 /* We need to add the extra size we masked off (for
1569 * correctness) so the data doesn't get clamped away */
1570 attrs[k].size += chopped_addr;
1571
1572 /* For non-instancing make sure we initialize */
1573 attrs[k].shift = attrs[k].extra_flags = 0;
1574
1575 /* Instancing uses a dramatically different code path than
1576 * linear, so dispatch for the actual emission now that the
1577 * common code is finished */
1578
1579 unsigned divisor = elem->instance_divisor;
1580
1581 if (divisor && ctx->instance_count == 1) {
1582 /* Silly corner case where there's a divisor(=1) but
1583 * there's no legitimate instancing. So we want *every*
1584 * attribute to be the same. So set stride to zero so
1585 * we don't go anywhere. */
1586
1587 attrs[k].size = attrs[k].stride + chopped_addr;
1588 attrs[k].stride = 0;
1589 attrs[k++].elements |= MALI_ATTR_LINEAR;
1590 } else if (ctx->instance_count <= 1) {
1591 /* Normal, non-instanced attributes */
1592 attrs[k++].elements |= MALI_ATTR_LINEAR;
1593 } else {
1594 unsigned instance_shift = vertex_postfix->instance_shift;
1595 unsigned instance_odd = vertex_postfix->instance_odd;
1596
1597 k += panfrost_vertex_instanced(ctx->padded_count,
1598 instance_shift,
1599 instance_odd,
1600 divisor, &attrs[k]);
1601 }
1602 }
1603
1604 /* Add special gl_VertexID/gl_InstanceID buffers */
1605
1606 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1607 so->hw[PAN_VERTEX_ID].index = k++;
1608 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1609 so->hw[PAN_INSTANCE_ID].index = k++;
1610
1611 /* Upload whatever we emitted and go */
1612
1613 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1614 k * sizeof(*attrs));
1615 }
1616
1617 static mali_ptr
1618 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1619 unsigned stride, unsigned count)
1620 {
1621 /* Fill out the descriptor */
1622 slot->stride = stride;
1623 slot->size = stride * count;
1624 slot->shift = slot->extra_flags = 0;
1625
1626 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1627 slot->size);
1628
1629 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1630
1631 return transfer.gpu;
1632 }
1633
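 /* Returns the sub-64-byte remainder of a streamout target address.
  * panfrost_emit_streamout() below aligns the base pointer down to 64
  * bytes, so (presumably mirroring the vertex buffer path above) this
  * remainder is added back via the record's src_offset. */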
1634 static unsigned
1635 panfrost_streamout_offset(unsigned stride, unsigned offset,
1636 struct pipe_stream_output_target *target)
1637 {
1638 return (target->buffer_offset + (offset * stride * 4)) & 63;
1639 }
1640
1641 static void
1642 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1643 unsigned stride, unsigned offset, unsigned count,
1644 struct pipe_stream_output_target *target)
1645 {
1646 /* Fill out the descriptor */
1647 slot->stride = stride * 4;
1648 slot->shift = slot->extra_flags = 0;
1649
1650 unsigned max_size = target->buffer_size;
1651 unsigned expected_size = slot->stride * count;
1652
1653 /* Grab the BO and bind it to the batch */
1654 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1655
1656 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1657 * the perspective of the TILER and FRAGMENT.
1658 */
1659 panfrost_batch_add_bo(batch, bo,
1660 PAN_BO_ACCESS_SHARED |
1661 PAN_BO_ACCESS_RW |
1662 PAN_BO_ACCESS_VERTEX_TILER |
1663 PAN_BO_ACCESS_FRAGMENT);
1664
1665 /* We will have an offset applied to get alignment */
1666 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1667 slot->elements = (addr & ~63) | MALI_ATTR_LINEAR;
1668 slot->size = MIN2(max_size, expected_size) + (addr & 63);
1669 }
1670
1671 static bool
1672 has_point_coord(unsigned mask, gl_varying_slot loc)
1673 {
1674 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1675 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1676 else if (loc == VARYING_SLOT_PNTC)
1677 return (mask & (1 << 8));
1678 else
1679 return false;
1680 }
1681
1682 /* Helpers for manipulating stream out information so we can pack varyings
1683 * accordingly. Compute the src_offset for a given captured varying */
1684
1685 static struct pipe_stream_output *
1686 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1687 {
1688 for (unsigned i = 0; i < info->num_outputs; ++i) {
1689 if (info->output[i].register_index == loc)
1690 return &info->output[i];
1691 }
1692
1693 unreachable("Varying not captured");
1694 }
1695
1696 static unsigned
1697 pan_varying_size(enum mali_format fmt)
1698 {
1699 unsigned type = MALI_EXTRACT_TYPE(fmt);
1700 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1701 unsigned bits = MALI_EXTRACT_BITS(fmt);
1702 unsigned bpc = 0;
1703
1704 if (bits == MALI_CHANNEL_FLOAT) {
1705 /* No doubles */
1706 bool fp16 = (type == MALI_FORMAT_SINT);
1707 assert(fp16 || (type == MALI_FORMAT_UNORM));
1708
1709 bpc = fp16 ? 2 : 4;
1710 } else {
1711 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1712
1713 /* See the enums */
1714 bits = 1 << bits;
1715 assert(bits >= 8);
1716 bpc = bits / 8;
1717 }
1718
1719 return bpc * chan;
1720 }
1721
1722 /* Indices for named (non-XFB) varyings that are present. These are packed
1723 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1724 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1725 * of a given special field given a shift S by:
1726 *
1727 * idx = popcount(P & ((1 << S) - 1))
1728 *
1729 * That is, look at all of the varyings that come earlier and count them; the
1730 * count is the new index. Likewise, the total number of special
1731 * buffers required is simply popcount(P)
1732 */
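 /* For example, if general, position and point size are all present,
  * P = 0b0111; the buffer index of PAN_VARY_PSIZ (shift 2) is
  * popcount(P & 0b0011) = 2, and popcount(P) = 3 buffers precede any
  * XFB targets. */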
1733
1734 enum pan_special_varying {
1735 PAN_VARY_GENERAL = 0,
1736 PAN_VARY_POSITION = 1,
1737 PAN_VARY_PSIZ = 2,
1738 PAN_VARY_PNTCOORD = 3,
1739 PAN_VARY_FACE = 4,
1740 PAN_VARY_FRAGCOORD = 5,
1741
1742 /* Keep last */
1743 PAN_VARY_MAX,
1744 };
1745
1746 /* Given a varying, figure out which index it corresponds to */
1747
1748 static inline unsigned
1749 pan_varying_index(unsigned present, enum pan_special_varying v)
1750 {
1751 unsigned mask = (1 << v) - 1;
1752 return util_bitcount(present & mask);
1753 }
1754
1755 /* Get the base offset for XFB buffers, which by convention come after
1756 * everything else. Wrapper function for semantic reasons; by construction this
1757 * is just popcount. */
1758
1759 static inline unsigned
1760 pan_xfb_base(unsigned present)
1761 {
1762 return util_bitcount(present);
1763 }
1764
1765 /* Computes the present mask for varyings so we can start emitting varying records */
1766
1767 static inline unsigned
1768 pan_varying_present(
1769 struct panfrost_shader_state *vs,
1770 struct panfrost_shader_state *fs,
1771 unsigned quirks)
1772 {
1773 /* At the moment we always emit general and position buffers. Not
1774 * strictly necessary but usually harmless */
1775
1776 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1777
1778 /* Enable special buffers by the shader info */
1779
1780 if (vs->writes_point_size)
1781 present |= (1 << PAN_VARY_PSIZ);
1782
1783 if (fs->reads_point_coord)
1784 present |= (1 << PAN_VARY_PNTCOORD);
1785
1786 if (fs->reads_face)
1787 present |= (1 << PAN_VARY_FACE);
1788
1789 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1790 present |= (1 << PAN_VARY_FRAGCOORD);
1791
1792 /* Also, if we have a point sprite, we need a point coord buffer */
1793
1794 for (unsigned i = 0; i < fs->varying_count; i++) {
1795 gl_varying_slot loc = fs->varyings_loc[i];
1796
1797 if (has_point_coord(fs->point_sprite_mask, loc))
1798 present |= (1 << PAN_VARY_PNTCOORD);
1799 }
1800
1801 return present;
1802 }
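
/* For instance (illustrative): a vertex shader that writes gl_PointSize paired
 * with a fragment shader that reads gl_PointCoord yields a present mask
 * covering PAN_VARY_GENERAL, PAN_VARY_POSITION, PAN_VARY_PSIZ and
 * PAN_VARY_PNTCOORD, so pan_xfb_base(present) = 4 and any XFB buffers start at
 * index 4. */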
1803
1804 /* Emitters for varying records */
1805
1806 static struct mali_attr_meta
1807 pan_emit_vary(unsigned present, enum pan_special_varying buf,
1808 unsigned quirks, enum mali_format format,
1809 unsigned offset)
1810 {
1811 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1812
1813 struct mali_attr_meta meta = {
1814 .index = pan_varying_index(present, buf),
1815 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1816 .swizzle = quirks & HAS_SWIZZLES ?
1817 panfrost_get_default_swizzle(nr_channels) :
1818 panfrost_bifrost_swizzle(nr_channels),
1819 .format = format,
1820 .src_offset = offset
1821 };
1822
1823 return meta;
1824 }
1825
1826 /* General varying that is unused */
1827
1828 static struct mali_attr_meta
1829 pan_emit_vary_only(unsigned present, unsigned quirks)
1830 {
1831 return pan_emit_vary(present, 0, quirks, MALI_VARYING_DISCARD, 0);
1832 }
1833
1834 /* Special records */
1835
1836 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1837 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1838 [PAN_VARY_PSIZ] = MALI_R16F,
1839 [PAN_VARY_PNTCOORD] = MALI_R16F,
1840 [PAN_VARY_FACE] = MALI_R32I,
1841 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1842 };
1843
1844 static struct mali_attr_meta
1845 pan_emit_vary_special(unsigned present, enum pan_special_varying buf,
1846 unsigned quirks)
1847 {
1848 assert(buf < PAN_VARY_MAX);
1849 return pan_emit_vary(present, buf, quirks, pan_varying_formats[buf], 0);
1850 }
1851
1852 static enum mali_format
1853 pan_xfb_format(enum mali_format format, unsigned nr)
1854 {
1855 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1856 return MALI_R32F | MALI_NR_CHANNELS(nr);
1857 else
1858 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1859 }
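
/* For example (illustrative): capturing three components of a float varying
 * yields MALI_R32F | MALI_NR_CHANNELS(3); integer varyings keep their base
 * type but are likewise widened to 32 bits per channel. */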
1860
1861 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1862 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1863 * value. */
1864
1865 static struct mali_attr_meta
1866 pan_emit_vary_xfb(unsigned present,
1867 unsigned max_xfb,
1868 unsigned *streamout_offsets,
1869 unsigned quirks,
1870 enum mali_format format,
1871 struct pipe_stream_output o)
1872 {
1873         /* Construct a record for the captured varying */
1874 struct mali_attr_meta meta = {
1875 /* XFB buffers come after everything else */
1876 .index = pan_xfb_base(present) + o.output_buffer,
1877
1878                 /* The usual unknown bit */
1879 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1880
1881 /* Override swizzle with number of channels */
1882 .swizzle = quirks & HAS_SWIZZLES ?
1883 panfrost_get_default_swizzle(o.num_components) :
1884 panfrost_bifrost_swizzle(o.num_components),
1885
1886 /* Override number of channels and precision to highp */
1887 .format = pan_xfb_format(format, o.num_components),
1888
1889 /* Apply given offsets together */
1890 .src_offset = (o.dst_offset * 4) /* dwords */
1891 + streamout_offsets[o.output_buffer]
1892 };
1893
1894 return meta;
1895 }
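
/* Worked example of the src_offset computation above (hypothetical numbers):
 * a varying captured at dst_offset = 4 (counted in dwords) into buffer 0 with
 * streamout_offsets[0] = 256 lands at src_offset = 4 * 4 + 256 = 272 bytes. */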
1896
1897 /* Determine if we should capture a varying for XFB. This requires actually
1898  * having a buffer for it. If we don't capture it, we'll fall back to a general
1899 * varying path (linked or unlinked, possibly discarding the write) */
1900
1901 static bool
1902 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1903 unsigned loc, unsigned max_xfb)
1904 {
1905 if (!(xfb->so_mask & (1ll << loc)))
1906 return false;
1907
1908 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1909 return o->output_buffer < max_xfb;
1910 }
1911
1912 /* Higher-level wrapper around all of the above, classifying a varying into one
1913 * of the above types */
1914
1915 static struct mali_attr_meta
1916 panfrost_emit_varying(
1917 struct panfrost_shader_state *stage,
1918 struct panfrost_shader_state *other,
1919 struct panfrost_shader_state *xfb,
1920 unsigned present,
1921 unsigned max_xfb,
1922 unsigned *streamout_offsets,
1923 unsigned quirks,
1924 unsigned *gen_offsets,
1925 enum mali_format *gen_formats,
1926 unsigned *gen_stride,
1927 unsigned idx,
1928 bool should_alloc,
1929 bool is_fragment)
1930 {
1931 gl_varying_slot loc = stage->varyings_loc[idx];
1932 enum mali_format format = stage->varyings[idx];
1933
1934 /* Override format to match linkage */
1935 if (!should_alloc && gen_formats[idx])
1936 format = gen_formats[idx];
1937
1938 if (has_point_coord(stage->point_sprite_mask, loc)) {
1939 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1940 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1941 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1942 return pan_emit_vary_xfb(present, max_xfb, streamout_offsets, quirks, format, *o);
1943 } else if (loc == VARYING_SLOT_POS) {
1944 if (is_fragment)
1945 return pan_emit_vary_special(present, PAN_VARY_FRAGCOORD, quirks);
1946 else
1947 return pan_emit_vary_special(present, PAN_VARY_POSITION, quirks);
1948 } else if (loc == VARYING_SLOT_PSIZ) {
1949 return pan_emit_vary_special(present, PAN_VARY_PSIZ, quirks);
1950 } else if (loc == VARYING_SLOT_PNTC) {
1951 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1952 } else if (loc == VARYING_SLOT_FACE) {
1953 return pan_emit_vary_special(present, PAN_VARY_FACE, quirks);
1954 }
1955
1956 /* We've exhausted special cases, so it's otherwise a general varying. Check if we're linked */
1957 signed other_idx = -1;
1958
1959 for (unsigned j = 0; j < other->varying_count; ++j) {
1960 if (other->varyings_loc[j] == loc) {
1961 other_idx = j;
1962 break;
1963 }
1964 }
1965
1966 if (other_idx < 0)
1967 return pan_emit_vary_only(present, quirks);
1968
1969 unsigned offset = gen_offsets[other_idx];
1970
1971 if (should_alloc) {
1972 /* We're linked, so allocate a space via a watermark allocation */
1973 enum mali_format alt = other->varyings[other_idx];
1974
1975 /* Do interpolation at minimum precision */
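                /* e.g. (illustrative) an fp32 output consumed as fp16 ends up packed at 2 bytes per channel */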
1976 unsigned size_main = pan_varying_size(format);
1977 unsigned size_alt = pan_varying_size(alt);
1978 unsigned size = MIN2(size_main, size_alt);
1979
1980 /* If a varying is marked for XFB but not actually captured, we
1981 * should match the format to the format that would otherwise
1982 * be used for XFB, since dEQP checks for invariance here. It's
1983 * unclear if this is required by the spec. */
1984
1985 if (xfb->so_mask & (1ull << loc)) {
1986 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1987 format = pan_xfb_format(format, o->num_components);
1988 size = pan_varying_size(format);
1989 } else if (size == size_alt) {
1990 format = alt;
1991 }
1992
1993 gen_offsets[idx] = *gen_stride;
1994 gen_formats[other_idx] = format;
1995 offset = *gen_stride;
1996 *gen_stride += size;
1997 }
1998
1999 return pan_emit_vary(present, PAN_VARY_GENERAL,
2000 quirks, format, offset);
2001 }
2002
2003 static void
2004 pan_emit_special_input(union mali_attr *varyings,
2005 unsigned present,
2006 enum pan_special_varying v,
2007 mali_ptr addr)
2008 {
2009 if (present & (1 << v)) {
2010 /* Ensure we write exactly once for performance and with fields
2011 * zeroed appropriately to avoid flakes */
2012
2013 union mali_attr s = {
2014 .elements = addr
2015 };
2016
2017 varyings[pan_varying_index(present, v)] = s;
2018 }
2019 }
2020
2021 void
2022 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2023 unsigned vertex_count,
2024 struct mali_vertex_tiler_postfix *vertex_postfix,
2025 struct mali_vertex_tiler_postfix *tiler_postfix,
2026 union midgard_primitive_size *primitive_size)
2027 {
2028 /* Load the shaders */
2029 struct panfrost_context *ctx = batch->ctx;
2030 struct panfrost_device *dev = pan_device(ctx->base.screen);
2031 struct panfrost_shader_state *vs, *fs;
2032 size_t vs_size, fs_size;
2033
2034 /* Allocate the varying descriptor */
2035
2036 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2037 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
2038 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
2039 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
2040
2041 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
2042 vs_size +
2043 fs_size);
2044
2045 struct pipe_stream_output_info *so = &vs->stream_output;
2046 unsigned present = pan_varying_present(vs, fs, dev->quirks);
2047
2048 /* Check if this varying is linked by us. This is the case for
2049 * general-purpose, non-captured varyings. If it is, link it. If it's
2050 * not, use the provided stream out information to determine the
2051 * offset, since it was already linked for us. */
2052
2053 unsigned gen_offsets[32];
2054 enum mali_format gen_formats[32];
2055 memset(gen_offsets, 0, sizeof(gen_offsets));
2056 memset(gen_formats, 0, sizeof(gen_formats));
2057
2058 unsigned gen_stride = 0;
2059 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
2060 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
2061
2062 unsigned streamout_offsets[32];
2063
2064 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2065 streamout_offsets[i] = panfrost_streamout_offset(
2066 so->stride[i],
2067 ctx->streamout.offsets[i],
2068 ctx->streamout.targets[i]);
2069 }
2070
2071 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
2072 struct mali_attr_meta *ofs = ovs + vs->varying_count;
2073
2074 for (unsigned i = 0; i < vs->varying_count; i++) {
2075 ovs[i] = panfrost_emit_varying(vs, fs, vs, present,
2076 ctx->streamout.num_targets, streamout_offsets,
2077 dev->quirks,
2078 gen_offsets, gen_formats, &gen_stride, i, true, false);
2079 }
2080
2081 for (unsigned i = 0; i < fs->varying_count; i++) {
2082 ofs[i] = panfrost_emit_varying(fs, vs, vs, present,
2083 ctx->streamout.num_targets, streamout_offsets,
2084 dev->quirks,
2085 gen_offsets, gen_formats, &gen_stride, i, false, true);
2086 }
2087
2088 unsigned xfb_base = pan_xfb_base(present);
2089 struct panfrost_transfer T = panfrost_allocate_transient(batch,
2090 sizeof(union mali_attr) * (xfb_base + ctx->streamout.num_targets));
2091 union mali_attr *varyings = (union mali_attr *) T.cpu;
2092
2093 /* Emit the stream out buffers */
2094
2095 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2096 ctx->vertex_count);
2097
2098 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2099 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2100 so->stride[i],
2101 ctx->streamout.offsets[i],
2102 out_count,
2103 ctx->streamout.targets[i]);
2104 }
2105
2106 panfrost_emit_varyings(batch,
2107 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2108 gen_stride, vertex_count);
2109
2110 /* fp32 vec4 gl_Position */
2111 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2112 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2113 sizeof(float) * 4, vertex_count);
2114
2115 if (present & (1 << PAN_VARY_PSIZ)) {
2116 primitive_size->pointer = panfrost_emit_varyings(batch,
2117 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2118 2, vertex_count);
2119 }
2120
2121 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_VARYING_POINT_COORD);
2122 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_VARYING_FRONT_FACING);
2123 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_VARYING_FRAG_COORD);
2124
2125 vertex_postfix->varyings = T.gpu;
2126 tiler_postfix->varyings = T.gpu;
2127
2128 vertex_postfix->varying_meta = trans.gpu;
2129 tiler_postfix->varying_meta = trans.gpu + vs_size;
2130 }
2131
2132 void
2133 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2134 struct mali_vertex_tiler_prefix *vertex_prefix,
2135 struct mali_vertex_tiler_postfix *vertex_postfix,
2136 struct mali_vertex_tiler_prefix *tiler_prefix,
2137 struct mali_vertex_tiler_postfix *tiler_postfix,
2138 union midgard_primitive_size *primitive_size)
2139 {
2140 struct panfrost_context *ctx = batch->ctx;
2141 struct panfrost_device *device = pan_device(ctx->base.screen);
2142 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
2143 struct bifrost_payload_vertex bifrost_vertex = {0,};
2144 struct bifrost_payload_tiler bifrost_tiler = {0,};
2145 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2146 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2147 void *vp, *tp;
2148 size_t vp_size, tp_size;
2149
2150 if (device->quirks & IS_BIFROST) {
2151 bifrost_vertex.prefix = *vertex_prefix;
2152 bifrost_vertex.postfix = *vertex_postfix;
2153 vp = &bifrost_vertex;
2154 vp_size = sizeof(bifrost_vertex);
2155
2156 bifrost_tiler.prefix = *tiler_prefix;
2157 bifrost_tiler.tiler.primitive_size = *primitive_size;
2158 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2159 bifrost_tiler.postfix = *tiler_postfix;
2160 tp = &bifrost_tiler;
2161 tp_size = sizeof(bifrost_tiler);
2162 } else {
2163 midgard_vertex.prefix = *vertex_prefix;
2164 midgard_vertex.postfix = *vertex_postfix;
2165 vp = &midgard_vertex;
2166 vp_size = sizeof(midgard_vertex);
2167
2168 midgard_tiler.prefix = *tiler_prefix;
2169 midgard_tiler.postfix = *tiler_postfix;
2170 midgard_tiler.primitive_size = *primitive_size;
2171 tp = &midgard_tiler;
2172 tp_size = sizeof(midgard_tiler);
2173 }
2174
2175 if (wallpapering) {
2176 /* Inject in reverse order, with "predicted" job indices.
2177 * THIS IS A HACK XXX */
2178 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2179 batch->job_index + 2, tp, tp_size, true);
2180 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2181 vp, vp_size, true);
2182 return;
2183 }
2184
2185         /* If rasterizer discard is enabled, only submit the vertex job */
2186
2187 bool rasterizer_discard = ctx->rasterizer &&
2188 ctx->rasterizer->base.rasterizer_discard;
2189
2190 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2191 vp, vp_size, false);
2192
2193 if (rasterizer_discard)
2194 return;
2195
2196 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2197 false);
2198 }
2199
2200 /* TODO: stop hardcoding this */
2201 mali_ptr
2202 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2203 {
2204 uint16_t locations[] = {
2205 128, 128,
2206 0, 256,
2207 0, 256,
2208 0, 256,
2209 0, 256,
2210 0, 256,
2211 0, 256,
2212 0, 256,
2213 0, 256,
2214 0, 256,
2215 0, 256,
2216 0, 256,
2217 0, 256,
2218 0, 256,
2219 0, 256,
2220 0, 256,
2221 0, 256,
2222 0, 256,
2223 0, 256,
2224 0, 256,
2225 0, 256,
2226 0, 256,
2227 0, 256,
2228 0, 256,
2229 0, 256,
2230 0, 256,
2231 0, 256,
2232 0, 256,
2233 0, 256,
2234 0, 256,
2235 0, 256,
2236 0, 256,
2237 128, 128,
2238 0, 0,
2239 0, 0,
2240 0, 0,
2241 0, 0,
2242 0, 0,
2243 0, 0,
2244 0, 0,
2245 0, 0,
2246 0, 0,
2247 0, 0,
2248 0, 0,
2249 0, 0,
2250 0, 0,
2251 0, 0,
2252 0, 0,
2253 };
2254
2255 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2256 }