panfrost: Don't mask coverage mask to 4-bits
mesa.git / src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
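/* Uploads a shared memory descriptor pointing at the batch's scratchpad so
 * jobs have stack space. Used on Bifrost; on Midgard the same postfix slot
 * instead carries the framebuffer pointer (see below). */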
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_pool_upload(&batch->pool, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
75 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
76 }
77
78 static void
79 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_prefix *prefix,
81 struct mali_vertex_tiler_postfix *postfix)
82 {
83 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
84
85 postfix->gl_enables |= 0x7;
86 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
87 rasterizer && rasterizer->base.front_ccw);
88 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
89 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
90 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
91 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
92 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
93 rasterizer && rasterizer->base.flatshade_first);
94 }
95
96 void
97 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
98 struct mali_vertex_tiler_prefix *prefix,
99 union midgard_primitive_size *primitive_size)
100 {
101 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
102
103 if (!panfrost_writes_point_size(ctx)) {
104 bool points = prefix->draw_mode == MALI_DRAW_MODE_POINTS;
105 float val = 0.0f;
106
107 if (rasterizer)
108 val = points ?
109 rasterizer->base.point_size :
110 rasterizer->base.line_width;
111
112 primitive_size->constant = val;
113 }
114 }
115
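/* Points the postfix at the active occlusion query's counter BO (retaining it
 * for the fragment part of the batch), or clears the pointer if no query is
 * bound. */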
116 static void
117 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
118 struct mali_vertex_tiler_postfix *postfix)
119 {
120 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
121 if (ctx->occlusion_query) {
122 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
123 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
124 PAN_BO_ACCESS_SHARED |
125 PAN_BO_ACCESS_RW |
126 PAN_BO_ACCESS_FRAGMENT);
127 } else {
128 postfix->occlusion_counter = 0;
129 }
130 }
131
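/* Common initialization for the vertex/tiler prefix/postfix pair. Note the
 * shared_memory slot means different things on Bifrost (shared memory
 * descriptor) and Midgard (framebuffer descriptor), hence the split below. */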
132 void
133 panfrost_vt_init(struct panfrost_context *ctx,
134 enum pipe_shader_type stage,
135 struct mali_vertex_tiler_prefix *prefix,
136 struct mali_vertex_tiler_postfix *postfix)
137 {
138 struct panfrost_device *device = pan_device(ctx->base.screen);
139
140 if (!ctx->shader[stage])
141 return;
142
143 memset(prefix, 0, sizeof(*prefix));
144 memset(postfix, 0, sizeof(*postfix));
145
146 if (device->quirks & IS_BIFROST) {
147 postfix->gl_enables = 0x2;
148 panfrost_vt_emit_shared_memory(ctx, postfix);
149 } else {
150 postfix->gl_enables = 0x6;
151 panfrost_vt_attach_framebuffer(ctx, postfix);
152 }
153
154 if (stage == PIPE_SHADER_FRAGMENT) {
155 panfrost_vt_update_occlusion_query(ctx, postfix);
156 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
157 }
158 }
159
160 static unsigned
161 panfrost_translate_index_size(unsigned size)
162 {
163 switch (size) {
164 case 1:
165 return MALI_DRAW_INDEXED_UINT8;
166
167 case 2:
168 return MALI_DRAW_INDEXED_UINT16;
169
170 case 4:
171 return MALI_DRAW_INDEXED_UINT32;
172
173 default:
174 unreachable("Invalid index size");
175 }
176 }
177
178 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
179  * good for the duration of the draw (transient), though it may last longer. Also get
180 * the bounds on the index buffer for the range accessed by the draw. We do
181 * these operations together because there are natural optimizations which
182 * require them to be together. */
183
184 static mali_ptr
185 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
186 const struct pipe_draw_info *info,
187 unsigned *min_index, unsigned *max_index)
188 {
189 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
190 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
191 off_t offset = info->start * info->index_size;
192 bool needs_indices = true;
193 mali_ptr out = 0;
194
195 if (info->max_index != ~0u) {
196 *min_index = info->min_index;
197 *max_index = info->max_index;
198 needs_indices = false;
199 }
200
201 if (!info->has_user_indices) {
202 /* Only resources can be directly mapped */
203 panfrost_batch_add_bo(batch, rsrc->bo,
204 PAN_BO_ACCESS_SHARED |
205 PAN_BO_ACCESS_READ |
206 PAN_BO_ACCESS_VERTEX_TILER);
207 out = rsrc->bo->gpu + offset;
208
209 /* Check the cache */
210 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
211 info->start,
212 info->count,
213 min_index,
214 max_index);
215 } else {
216 /* Otherwise, we need to upload to transient memory */
217 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
218 out = panfrost_pool_upload(&batch->pool, ibuf8 + offset,
219 info->count *
220 info->index_size);
221 }
222
223 if (needs_indices) {
224 /* Fallback */
225 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
226
227 if (!info->has_user_indices)
228 panfrost_minmax_cache_add(rsrc->index_cache,
229 info->start, info->count,
230 *min_index, *max_index);
231 }
232
233 return out;
234 }
235
236 void
237 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
238 const struct pipe_draw_info *info,
239 enum mali_draw_mode draw_mode,
240 struct mali_vertex_tiler_postfix *vertex_postfix,
241 struct mali_vertex_tiler_prefix *tiler_prefix,
242 struct mali_vertex_tiler_postfix *tiler_postfix,
243 unsigned *vertex_count,
244 unsigned *padded_count)
245 {
246 tiler_prefix->draw_mode = draw_mode;
247
248 unsigned draw_flags = 0;
249
250 if (panfrost_writes_point_size(ctx))
251 draw_flags |= MALI_DRAW_VARYING_SIZE;
252
253 if (info->primitive_restart)
254 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
255
256 /* These don't make much sense */
257
258 draw_flags |= 0x3000;
259
260 if (info->index_size) {
261 unsigned min_index = 0, max_index = 0;
262
263 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
264 info,
265 &min_index,
266 &max_index);
267
268 /* Use the corresponding values */
269 *vertex_count = max_index - min_index + 1;
270 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
271 tiler_prefix->offset_bias_correction = -min_index;
272 tiler_prefix->index_count = MALI_POSITIVE(info->count);
273 draw_flags |= panfrost_translate_index_size(info->index_size);
274 } else {
275 tiler_prefix->indices = 0;
276 *vertex_count = ctx->vertex_count;
277 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
278 tiler_prefix->offset_bias_correction = 0;
279 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
280 }
281
282 tiler_prefix->unknown_draw = draw_flags;
283
284 /* Encode the padded vertex count */
285
286 if (info->instance_count > 1) {
287 *padded_count = panfrost_padded_vertex_count(*vertex_count);
288
289 unsigned shift = __builtin_ctz(ctx->padded_count);
290 unsigned k = ctx->padded_count >> (shift + 1);
291
292 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
293 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
294 } else {
295 *padded_count = *vertex_count;
296
297 /* Reset instancing state */
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
300 }
301 }
302
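/* Fills out a shader descriptor from the compiled shader state: shader
 * address and first tag, attribute/varying/texture/sampler counts, and the
 * architecture-specific (Midgard vs. Bifrost) control words. */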
303 static void
304 panfrost_shader_meta_init(struct panfrost_context *ctx,
305 enum pipe_shader_type st,
306 struct mali_shader_meta *meta)
307 {
308 const struct panfrost_device *dev = pan_device(ctx->base.screen);
309 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
310
311 memset(meta, 0, sizeof(*meta));
312 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
313 meta->attribute_count = ss->attribute_count;
314 meta->varying_count = ss->varying_count;
315 meta->texture_count = ctx->sampler_view_count[st];
316 meta->sampler_count = ctx->sampler_count[st];
317
318 if (dev->quirks & IS_BIFROST) {
319 if (st == PIPE_SHADER_VERTEX)
320 meta->bifrost1.unk1 = 0x800000;
321 else {
322 /* First clause ATEST |= 0x4000000.
323 * Less than 32 regs |= 0x200 */
324 meta->bifrost1.unk1 = 0x950020;
325 }
326
327 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
328 if (st == PIPE_SHADER_VERTEX)
329 meta->bifrost2.preload_regs = 0xC0;
330 else {
331 meta->bifrost2.preload_regs = 0x1;
332 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
333 }
334
335 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
336 ss->uniform_cutoff);
337 } else {
338 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
339 ss->uniform_cutoff);
340 meta->midgard1.work_count = ss->work_reg_count;
341
342 /* TODO: This is not conformant on ES3 */
343 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
344
345 meta->midgard1.flags_lo = 0x20;
346 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
347
348 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
349 }
350 }
351
352 static unsigned
353 translate_tex_wrap(enum pipe_tex_wrap w)
354 {
355 switch (w) {
356 case PIPE_TEX_WRAP_REPEAT:
357 return MALI_WRAP_MODE_REPEAT;
358
359 case PIPE_TEX_WRAP_CLAMP:
360 return MALI_WRAP_MODE_CLAMP;
361
362 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
363 return MALI_WRAP_MODE_CLAMP_TO_EDGE;
364
365 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
366 return MALI_WRAP_MODE_CLAMP_TO_BORDER;
367
368 case PIPE_TEX_WRAP_MIRROR_REPEAT:
369 return MALI_WRAP_MODE_MIRRORED_REPEAT;
370
371 case PIPE_TEX_WRAP_MIRROR_CLAMP:
372 return MALI_WRAP_MODE_MIRRORED_CLAMP;
373
374 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
375 return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
376
377 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
378 return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
379
380 default:
381 unreachable("Invalid wrap");
382 }
383 }
384
385 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
386 struct mali_sampler_descriptor *hw)
387 {
388 unsigned func = panfrost_translate_compare_func(cso->compare_func);
389 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
390 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
391 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
392 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
393 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
394 unsigned mip_filter = mip_linear ?
395 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
396 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
397
398 *hw = (struct mali_sampler_descriptor) {
399 .filter_mode = min_filter | mag_filter | mip_filter |
400 normalized,
401 .wrap_s = translate_tex_wrap(cso->wrap_s),
402 .wrap_t = translate_tex_wrap(cso->wrap_t),
403 .wrap_r = translate_tex_wrap(cso->wrap_r),
404 .compare_func = cso->compare_mode ?
405 panfrost_flip_compare_func(func) :
406 MALI_FUNC_NEVER,
407 .border_color = {
408 cso->border_color.f[0],
409 cso->border_color.f[1],
410 cso->border_color.f[2],
411 cso->border_color.f[3]
412 },
413 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
414 .max_lod = FIXED_16(cso->max_lod, false),
415 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
416 .seamless_cube_map = cso->seamless_cube_map,
417 };
418
419 /* If necessary, we disable mipmapping in the sampler descriptor by
420 * clamping the LOD as tight as possible (from 0 to epsilon,
421 * essentially -- remember these are fixed point numbers, so
422 * epsilon=1/256) */
423
424 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
425 hw->max_lod = hw->min_lod + 1;
426 }
427
428 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
429 struct bifrost_sampler_descriptor *hw)
430 {
431 *hw = (struct bifrost_sampler_descriptor) {
432 .unk1 = 0x1,
433 .wrap_s = translate_tex_wrap(cso->wrap_s),
434 .wrap_t = translate_tex_wrap(cso->wrap_t),
435 .wrap_r = translate_tex_wrap(cso->wrap_r),
436 .unk8 = 0x8,
437 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
438 .norm_coords = cso->normalized_coords,
439 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
440 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
441 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
442 .max_lod = FIXED_16(cso->max_lod, false),
443 };
444
445 /* If necessary, we disable mipmapping in the sampler descriptor by
446 * clamping the LOD as tight as possible (from 0 to epsilon,
447 * essentially -- remember these are fixed point numbers, so
448 * epsilon=1/256) */
449
450 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
451 hw->max_lod = hw->min_lod + 1;
452 }
453
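/* Applies rasterizer state (MSAA, depth bias, depth clipping) to the fragment
 * shader descriptor, falling back to defaults when no rasterizer is bound. */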
454 static void
455 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
456 struct mali_shader_meta *fragmeta)
457 {
458 if (!ctx->rasterizer) {
459 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
460 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
461 fragmeta->depth_units = 0.0f;
462 fragmeta->depth_factor = 0.0f;
463 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
464 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
465 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, true);
466 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, true);
467 return;
468 }
469
470 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
471
472 bool msaa = rast->multisample;
473
474 /* TODO: Sample size */
475 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
476 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
477
478 struct panfrost_shader_state *fs;
479 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
480
481 /* EXT_shader_framebuffer_fetch requires the shader to be run
482 * per-sample when outputs are read. */
483 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
484 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
485
486 fragmeta->depth_units = rast->offset_units * 2.0f;
487 fragmeta->depth_factor = rast->offset_scale;
488
489 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
490
491 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
492 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
493
494 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
495 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
496 }
497
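/* Applies depth/stencil state: stencil masks, the packed stencil words (with
 * the reference value in the bottom bits), the depth write mask and the depth
 * compare function. */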
498 static void
499 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
500 struct mali_shader_meta *fragmeta)
501 {
502 const struct panfrost_zsa_state *so = ctx->depth_stencil;
503 int zfunc = PIPE_FUNC_ALWAYS;
504
505 if (!so) {
506 /* If no depth/stencil state is bound, disable stencil testing and depth writes */
507 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
508 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
509 } else {
510 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
511 so->base.stencil[0].enabled);
512
513 fragmeta->stencil_mask_front = so->stencil_mask_front;
514 fragmeta->stencil_mask_back = so->stencil_mask_back;
515
516 /* Bottom bits for stencil ref, exactly one word */
517 fragmeta->stencil_front.opaque[0] = so->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
518
519 /* If back-stencil is not enabled, use the front values */
520
521 if (so->base.stencil[1].enabled)
522 fragmeta->stencil_back.opaque[0] = so->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
523 else
524 fragmeta->stencil_back = fragmeta->stencil_front;
525
526 if (so->base.depth.enabled)
527 zfunc = so->base.depth.func;
528
529 /* Depth state (TODO: Refactor) */
530
531 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
532 so->base.depth.writemask);
533 }
534
535 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
536 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
537 }
538
539 static bool
540 panfrost_fs_required(
541 struct panfrost_shader_state *fs,
542 struct panfrost_blend_final *blend,
543 unsigned rt_count)
544 {
545 /* If we generally have side effects */
546 if (fs->fs_sidefx)
547 return true;
548
549 /* If colour is written we need to execute */
550 for (unsigned i = 0; i < rt_count; ++i) {
551 if (!blend[i].no_colour)
552 return true;
553 }
554
555 /* If depth is written and not implied we need to execute.
556 * TODO: Predicate on Z/S writes being enabled */
557 return (fs->writes_depth || fs->writes_stencil);
558 }
559
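/* Configures blending in the fragment shader descriptor and, for MFBD, in the
 * per-render-target blend records that follow it (rts). SFBD keeps RT0's
 * blend state inline; the shader may also be nulled out entirely when the
 * hardware allows it and no fragment work is required. */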
560 static void
561 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
562 struct mali_shader_meta *fragmeta,
563 void *rts)
564 {
565 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
566 const struct panfrost_device *dev = pan_device(ctx->base.screen);
567 struct panfrost_shader_state *fs;
568 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
569
570 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
571 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
572 !ctx->blend->base.dither);
573
574 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
575 ctx->blend->base.alpha_to_coverage);
576
577 /* Get blending setup */
578 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
579
580 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
581 unsigned shader_offset = 0;
582 struct panfrost_bo *shader_bo = NULL;
583
584 for (unsigned c = 0; c < rt_count; ++c)
585 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
586 &shader_offset);
587
588 /* Disable shader execution if we can */
589 if (dev->quirks & MIDGARD_SHADERLESS
590 && !panfrost_fs_required(fs, blend, rt_count)) {
591 fragmeta->shader = 0;
592 fragmeta->attribute_count = 0;
593 fragmeta->varying_count = 0;
594 fragmeta->texture_count = 0;
595 fragmeta->sampler_count = 0;
596
597 /* This feature is not known to work on Bifrost */
598 fragmeta->midgard1.work_count = 1;
599 fragmeta->midgard1.uniform_count = 0;
600 fragmeta->midgard1.uniform_buffer_count = 0;
601 }
602
603 /* If there is a blend shader, work registers are shared. We impose 8
604 * work registers as a limit for blend shaders. Should be lower XXX */
605
606 if (!(dev->quirks & IS_BIFROST)) {
607 for (unsigned c = 0; c < rt_count; ++c) {
608 if (blend[c].is_shader) {
609 fragmeta->midgard1.work_count =
610 MAX2(fragmeta->midgard1.work_count, 8);
611 }
612 }
613 }
614
615 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
616 * copied to the blend_meta appended (by convention), but this is the
617 * field actually read by the hardware. (Or maybe both are read...?).
618 * Specify the last RTi with a blend shader. */
619
620 fragmeta->blend.shader = 0;
621
622 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
623 if (!blend[rt].is_shader)
624 continue;
625
626 fragmeta->blend.shader = blend[rt].shader.gpu |
627 blend[rt].shader.first_tag;
628 break;
629 }
630
631 if (dev->quirks & MIDGARD_SFBD) {
632 * On platforms with only a single render target (SFBD), the blend
633 * information is inside the shader meta itself. We additionally
634 * need to signal CAN_DISCARD for nontrivial blend modes (so
635 * we're able to read back the destination buffer) */
636
637 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
638 blend[0].is_shader);
639
640 if (!blend[0].is_shader) {
641 fragmeta->blend.equation = *blend[0].equation.equation;
642 fragmeta->blend.constant = blend[0].equation.constant;
643 }
644
645 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
646 !blend[0].no_blending || fs->can_discard);
647
648 batch->draws |= PIPE_CLEAR_COLOR0;
649 return;
650 }
651
652 if (dev->quirks & IS_BIFROST) {
653 bool no_blend = true;
654
655 for (unsigned i = 0; i < rt_count; ++i)
656 no_blend &= (blend[i].no_blending | blend[i].no_colour);
657
658 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
659 !fs->can_discard && !fs->writes_depth && no_blend);
660 }
661
662 /* Additional blend descriptor tacked on for jobs using MFBD */
663
664 for (unsigned i = 0; i < rt_count; ++i) {
665 unsigned flags = 0;
666
667 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
668 flags = 0x200;
669 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
670
671 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
672 (ctx->pipe_framebuffer.cbufs[i]) &&
673 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
674
675 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
676 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
677 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
678 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
679 }
680
681 if (dev->quirks & IS_BIFROST) {
682 struct bifrost_blend_rt *brts = rts;
683
684 brts[i].flags = flags;
685
686 if (blend[i].is_shader) {
687 /* The blend shader's address needs to have
688  * the same top 32 bits as the fragment shader's.
689 * TODO: Ensure that's always the case.
690 */
691 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
692 (fs->bo->gpu & (0xffffffffull << 32)));
693 brts[i].shader = blend[i].shader.gpu;
694 brts[i].unk2 = 0x0;
695 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
696 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
697 const struct util_format_description *format_desc;
698 format_desc = util_format_description(format);
699
700 brts[i].equation = *blend[i].equation.equation;
701
702 /* TODO: this is a bit more complicated */
703 brts[i].constant = blend[i].equation.constant;
704
705 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
706
707 /* 0x19 disables blending and forces REPLACE
708 * mode (equivalent to rgb_mode = alpha_mode =
709 * x122, colour mask = 0xF). 0x1a allows
710 * blending. */
711 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
712
713 brts[i].shader_type = fs->blend_types[i];
714 } else {
715 /* Dummy attachment for depth-only */
716 brts[i].unk2 = 0x3;
717 brts[i].shader_type = fs->blend_types[i];
718 }
719 } else {
720 struct midgard_blend_rt *mrts = rts;
721 mrts[i].flags = flags;
722
723 if (blend[i].is_shader) {
724 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
725 } else {
726 mrts[i].blend.equation = *blend[i].equation.equation;
727 mrts[i].blend.constant = blend[i].equation.constant;
728 }
729 }
730 }
731 }
732
733 static void
734 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
735 struct mali_shader_meta *fragmeta,
736 void *rts)
737 {
738 const struct panfrost_device *dev = pan_device(ctx->base.screen);
739 struct panfrost_shader_state *fs;
740
741 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
742
743 bool msaa = ctx->rasterizer && ctx->rasterizer->base.multisample;
744 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
745
746 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
747 fragmeta->unknown2_4 = 0x4e0;
748
749 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
750 * is required (independent of 32-bit/64-bit descriptors), or why it's
751 * not used on later GPU revisions. Otherwise, all shader jobs fault on
752 * these earlier chips (perhaps this is a chicken bit of some kind).
753 * More investigation is needed. */
754
755 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
756
757 if (dev->quirks & IS_BIFROST) {
758 /* TODO */
759 } else {
760 /* Depending on whether it's legal in the given shader, we try to
761 * enable early-z testing. TODO: respect e-z force */
762
763 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
764 !fs->can_discard && !fs->writes_global &&
765 !fs->writes_depth && !fs->writes_stencil &&
766 !ctx->blend->base.alpha_to_coverage);
767
768 /* Add the writes Z/S flags if needed. */
769 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
770 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
771
772 /* Any time texturing is used, derivatives are implicitly calculated,
773 * so we need to enable helper invocations */
774
775 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
776 fs->helper_invocations);
777
778 /* If discard is enabled, which bit we set to convey this
779 * depends on whether depth/stencil is used for the draw or not.
780 * Just one of depth OR stencil is enough to trigger this. */
781
782 const struct pipe_depth_stencil_alpha_state *zsa = &ctx->depth_stencil->base;
783 bool zs_enabled = fs->writes_depth || fs->writes_stencil;
784
785 if (zsa) {
786 zs_enabled |= (zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
787 zs_enabled |= zsa->stencil[0].enabled;
788 }
789
790 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
791 fs->outputs_read || (!zs_enabled && fs->can_discard));
792 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, zs_enabled && fs->can_discard);
793 }
794
795 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
796 panfrost_frag_meta_zsa_update(ctx, fragmeta);
797 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
798 }
799
800 void
801 panfrost_emit_shader_meta(struct panfrost_batch *batch,
802 enum pipe_shader_type st,
803 struct mali_vertex_tiler_postfix *postfix)
804 {
805 struct panfrost_context *ctx = batch->ctx;
806 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
807
808 if (!ss) {
809 postfix->shader = 0;
810 return;
811 }
812
813 struct mali_shader_meta meta;
814
815 panfrost_shader_meta_init(ctx, st, &meta);
816
817 /* Add the shader BO to the batch. */
818 panfrost_batch_add_bo(batch, ss->bo,
819 PAN_BO_ACCESS_PRIVATE |
820 PAN_BO_ACCESS_READ |
821 panfrost_bo_access_for_stage(st));
822
823 mali_ptr shader_ptr;
824
825 if (st == PIPE_SHADER_FRAGMENT) {
826 struct panfrost_device *dev = pan_device(ctx->base.screen);
827 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
828 size_t desc_size = sizeof(meta);
829 void *rts = NULL;
830 struct panfrost_transfer xfer;
831 unsigned rt_size;
832
833 if (dev->quirks & MIDGARD_SFBD)
834 rt_size = 0;
835 else if (dev->quirks & IS_BIFROST)
836 rt_size = sizeof(struct bifrost_blend_rt);
837 else
838 rt_size = sizeof(struct midgard_blend_rt);
839
840 desc_size += rt_size * rt_count;
841
842 if (rt_size)
843 rts = rzalloc_size(ctx, rt_size * rt_count);
844
845 panfrost_frag_shader_meta_init(ctx, &meta, rts);
846
847 xfer = panfrost_pool_alloc(&batch->pool, desc_size);
848
849 memcpy(xfer.cpu, &meta, sizeof(meta));
850 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
851
852 if (rt_size)
853 ralloc_free(rts);
854
855 shader_ptr = xfer.gpu;
856 } else {
857 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
858 sizeof(meta));
859 }
860
861 postfix->shader = shader_ptr;
862 }
863
864 void
865 panfrost_emit_viewport(struct panfrost_batch *batch,
866 struct mali_vertex_tiler_postfix *tiler_postfix)
867 {
868 struct panfrost_context *ctx = batch->ctx;
869 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
870 const struct pipe_scissor_state *ss = &ctx->scissor;
871 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
872 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
873
874 /* Derive min/max from translate/scale. Note since |x| >= 0 by
875 * definition, we have that -|x| <= |x| hence translate - |scale| <=
876 * translate + |scale|, so the ordering is correct here. */
877 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
878 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
879 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
880 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
881 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
882 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
883
884 /* Scissor to the intersection of the viewport and the scissor, clamped
885 * to the framebuffer */
886
887 unsigned minx = MIN2(fb->width, vp_minx);
888 unsigned maxx = MIN2(fb->width, vp_maxx);
889 unsigned miny = MIN2(fb->height, vp_miny);
890 unsigned maxy = MIN2(fb->height, vp_maxy);
891
892 if (ss && rast && rast->scissor) {
893 minx = MAX2(ss->minx, minx);
894 miny = MAX2(ss->miny, miny);
895 maxx = MIN2(ss->maxx, maxx);
896 maxy = MIN2(ss->maxy, maxy);
897 }
898
899 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
900
901 pan_pack(T.cpu, VIEWPORT, cfg) {
902 cfg.scissor_minimum_x = minx;
903 cfg.scissor_minimum_y = miny;
904 cfg.scissor_maximum_x = maxx - 1;
905 cfg.scissor_maximum_y = maxy - 1;
906
907 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
908 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
909 }
910
911 tiler_postfix->viewport = T.gpu;
912 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
913 }
914
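/* Returns a GPU pointer for a constant buffer: resource-backed UBOs are
 * referenced in place (and retained by the batch), while user buffers are
 * copied to transient memory. */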
915 static mali_ptr
916 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
917 enum pipe_shader_type st,
918 struct panfrost_constant_buffer *buf,
919 unsigned index)
920 {
921 struct pipe_constant_buffer *cb = &buf->cb[index];
922 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
923
924 if (rsrc) {
925 panfrost_batch_add_bo(batch, rsrc->bo,
926 PAN_BO_ACCESS_SHARED |
927 PAN_BO_ACCESS_READ |
928 panfrost_bo_access_for_stage(st));
929
930 /* Alignment guaranteed by
931 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
932 return rsrc->bo->gpu + cb->buffer_offset;
933 } else if (cb->user_buffer) {
934 return panfrost_pool_upload(&batch->pool,
935 cb->user_buffer +
936 cb->buffer_offset,
937 cb->buffer_size);
938 } else {
939 unreachable("No constant buffer");
940 }
941 }
942
943 struct sysval_uniform {
944 union {
945 float f[4];
946 int32_t i[4];
947 uint32_t u[4];
948 uint64_t du[2];
949 };
950 };
951
952 static void
953 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
954 struct sysval_uniform *uniform)
955 {
956 struct panfrost_context *ctx = batch->ctx;
957 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
958
959 uniform->f[0] = vp->scale[0];
960 uniform->f[1] = vp->scale[1];
961 uniform->f[2] = vp->scale[2];
962 }
963
964 static void
965 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
966 struct sysval_uniform *uniform)
967 {
968 struct panfrost_context *ctx = batch->ctx;
969 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
970
971 uniform->f[0] = vp->translate[0];
972 uniform->f[1] = vp->translate[1];
973 uniform->f[2] = vp->translate[2];
974 }
975
976 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
977 enum pipe_shader_type st,
978 unsigned int sysvalid,
979 struct sysval_uniform *uniform)
980 {
981 struct panfrost_context *ctx = batch->ctx;
982 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
983 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
984 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
985 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
986
987 assert(dim);
988 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
989
990 if (dim > 1)
991 uniform->i[1] = u_minify(tex->texture->height0,
992 tex->u.tex.first_level);
993
994 if (dim > 2)
995 uniform->i[2] = u_minify(tex->texture->depth0,
996 tex->u.tex.first_level);
997
998 if (is_array)
999 uniform->i[dim] = tex->texture->array_size;
1000 }
1001
1002 static void
1003 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1004 enum pipe_shader_type st,
1005 unsigned ssbo_id,
1006 struct sysval_uniform *uniform)
1007 {
1008 struct panfrost_context *ctx = batch->ctx;
1009
1010 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1011 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1012
1013 /* Compute address */
1014 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1015
1016 panfrost_batch_add_bo(batch, bo,
1017 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1018 panfrost_bo_access_for_stage(st));
1019
1020 /* Upload address and size as sysval */
1021 uniform->du[0] = bo->gpu + sb.buffer_offset;
1022 uniform->u[2] = sb.buffer_size;
1023 }
1024
1025 static void
1026 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1027 enum pipe_shader_type st,
1028 unsigned samp_idx,
1029 struct sysval_uniform *uniform)
1030 {
1031 struct panfrost_context *ctx = batch->ctx;
1032 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1033
1034 uniform->f[0] = sampl->min_lod;
1035 uniform->f[1] = sampl->max_lod;
1036 uniform->f[2] = sampl->lod_bias;
1037
1038 /* Even without any errata, Midgard represents "no mipmapping" as
1039 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1040 * panfrost_create_sampler_state which also explains our choice of
1041 * epsilon value (again to keep behaviour consistent) */
1042
1043 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1044 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1045 }
1046
1047 static void
1048 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1049 struct sysval_uniform *uniform)
1050 {
1051 struct panfrost_context *ctx = batch->ctx;
1052
1053 uniform->u[0] = ctx->compute_grid->grid[0];
1054 uniform->u[1] = ctx->compute_grid->grid[1];
1055 uniform->u[2] = ctx->compute_grid->grid[2];
1056 }
1057
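/* Walks the shader's sysval table and writes one 16-byte slot per sysval at
 * the start of the uniform area, dispatching on the sysval type. */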
1058 static void
1059 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1060 struct panfrost_shader_state *ss,
1061 enum pipe_shader_type st)
1062 {
1063 struct sysval_uniform *uniforms = (void *)buf;
1064
1065 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1066 int sysval = ss->sysval[i];
1067
1068 switch (PAN_SYSVAL_TYPE(sysval)) {
1069 case PAN_SYSVAL_VIEWPORT_SCALE:
1070 panfrost_upload_viewport_scale_sysval(batch,
1071 &uniforms[i]);
1072 break;
1073 case PAN_SYSVAL_VIEWPORT_OFFSET:
1074 panfrost_upload_viewport_offset_sysval(batch,
1075 &uniforms[i]);
1076 break;
1077 case PAN_SYSVAL_TEXTURE_SIZE:
1078 panfrost_upload_txs_sysval(batch, st,
1079 PAN_SYSVAL_ID(sysval),
1080 &uniforms[i]);
1081 break;
1082 case PAN_SYSVAL_SSBO:
1083 panfrost_upload_ssbo_sysval(batch, st,
1084 PAN_SYSVAL_ID(sysval),
1085 &uniforms[i]);
1086 break;
1087 case PAN_SYSVAL_NUM_WORK_GROUPS:
1088 panfrost_upload_num_work_groups_sysval(batch,
1089 &uniforms[i]);
1090 break;
1091 case PAN_SYSVAL_SAMPLER:
1092 panfrost_upload_sampler_sysval(batch, st,
1093 PAN_SYSVAL_ID(sysval),
1094 &uniforms[i]);
1095 break;
1096 default:
1097 assert(0);
1098 }
1099 }
1100 }
1101
1102 static const void *
1103 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1104 unsigned index)
1105 {
1106 struct pipe_constant_buffer *cb = &buf->cb[index];
1107 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1108
1109 if (rsrc)
1110 return rsrc->bo->cpu;
1111 else if (cb->user_buffer)
1112 return cb->user_buffer;
1113 else
1114 unreachable("No constant buffer");
1115 }
1116
1117 void
1118 panfrost_emit_const_buf(struct panfrost_batch *batch,
1119 enum pipe_shader_type stage,
1120 struct mali_vertex_tiler_postfix *postfix)
1121 {
1122 struct panfrost_context *ctx = batch->ctx;
1123 struct panfrost_shader_variants *all = ctx->shader[stage];
1124
1125 if (!all)
1126 return;
1127
1128 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1129
1130 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1131
1132 /* Uniforms are implicitly UBO #0 */
1133 bool has_uniforms = buf->enabled_mask & (1 << 0);
1134
1135 /* Allocate room for the sysval and the uniforms */
1136 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1137 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1138 size_t size = sys_size + uniform_size;
1139 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1140 size);
1141
1142 /* Upload sysvals requested by the shader */
1143 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1144
1145 /* Upload uniforms */
1146 if (has_uniforms && uniform_size) {
1147 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1148 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1149 }
1150
1151 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1152 * uploaded */
1153
1154 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1155 assert(ubo_count >= 1);
1156
1157 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1158 struct panfrost_transfer ubos = panfrost_pool_alloc(&batch->pool, sz);
1159 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1160
1161 /* Upload uniforms as a UBO */
1162
1163 if (ss->uniform_count) {
1164 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1165 cfg.entries = ss->uniform_count;
1166 cfg.pointer = transfer.gpu;
1167 }
1168 } else {
1169 *ubo_ptr = 0;
1170 }
1171
1172 /* The rest are honest-to-goodness UBOs */
1173
1174 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1175 size_t usz = buf->cb[ubo].buffer_size;
1176 bool enabled = buf->enabled_mask & (1 << ubo);
1177 bool empty = usz == 0;
1178
1179 if (!enabled || empty) {
1180 ubo_ptr[ubo] = 0;
1181 continue;
1182 }
1183
1184 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1185 cfg.entries = DIV_ROUND_UP(usz, 16);
1186 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1187 stage, buf, ubo);
1188 }
1189 }
1190
1191 postfix->uniforms = transfer.gpu;
1192 postfix->uniform_buffers = ubos.gpu;
1193
1194 buf->dirty_mask = 0;
1195 }
1196
1197 void
1198 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1199 const struct pipe_grid_info *info,
1200 struct midgard_payload_vertex_tiler *vtp)
1201 {
1202 struct panfrost_context *ctx = batch->ctx;
1203 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1204 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1205 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1206 128));
1207 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1208 info->grid[2] * 4;
1209 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1210 shared_size,
1211 1);
1212
1213 struct mali_shared_memory shared = {
1214 .shared_memory = bo->gpu,
1215 .shared_workgroup_count =
1216 util_logbase2_ceil(info->grid[0]) +
1217 util_logbase2_ceil(info->grid[1]) +
1218 util_logbase2_ceil(info->grid[2]),
1219 .shared_unk1 = 0x2,
1220 .shared_shift = util_logbase2(single_size) - 1
1221 };
1222
1223 vtp->postfix.shared_memory = panfrost_pool_upload(&batch->pool, &shared,
1224 sizeof(shared));
1225 }
1226
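/* Returns the GPU address of the (Midgard) texture descriptor for a sampler
 * view, retaining both the texture's BO and the descriptor BO in the batch. */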
1227 static mali_ptr
1228 panfrost_get_tex_desc(struct panfrost_batch *batch,
1229 enum pipe_shader_type st,
1230 struct panfrost_sampler_view *view)
1231 {
1232 if (!view)
1233 return (mali_ptr) 0;
1234
1235 struct pipe_sampler_view *pview = &view->base;
1236 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1237
1238 /* Add the BO to the job so it's retained until the job is done. */
1239
1240 panfrost_batch_add_bo(batch, rsrc->bo,
1241 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1242 panfrost_bo_access_for_stage(st));
1243
1244 panfrost_batch_add_bo(batch, view->bo,
1245 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1246 panfrost_bo_access_for_stage(st));
1247
1248 return view->bo->gpu;
1249 }
1250
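/* Recreates the sampler view's descriptor BO if the backing resource's BO or
 * modifier has changed since the view was created. */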
1251 static void
1252 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1253 struct pipe_context *pctx)
1254 {
1255 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1256 if (view->texture_bo != rsrc->bo->gpu ||
1257 view->modifier != rsrc->modifier) {
1258 panfrost_bo_unreference(view->bo);
1259 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1260 }
1261 }
1262
1263 void
1264 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1265 enum pipe_shader_type stage,
1266 struct mali_vertex_tiler_postfix *postfix)
1267 {
1268 struct panfrost_context *ctx = batch->ctx;
1269 struct panfrost_device *device = pan_device(ctx->base.screen);
1270
1271 if (!ctx->sampler_view_count[stage])
1272 return;
1273
1274 if (device->quirks & IS_BIFROST) {
1275 struct bifrost_texture_descriptor *descriptors;
1276
1277 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1278 ctx->sampler_view_count[stage]);
1279
1280 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1281 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1282 struct pipe_sampler_view *pview = &view->base;
1283 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1284 panfrost_update_sampler_view(view, &ctx->base);
1285
1286 /* Add the BOs to the job so they are retained until the job is done. */
1287
1288 panfrost_batch_add_bo(batch, rsrc->bo,
1289 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1290 panfrost_bo_access_for_stage(stage));
1291
1292 panfrost_batch_add_bo(batch, view->bo,
1293 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1294 panfrost_bo_access_for_stage(stage));
1295
1296 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1297 }
1298
1299 postfix->textures = panfrost_pool_upload(&batch->pool,
1300 descriptors,
1301 sizeof(struct bifrost_texture_descriptor) *
1302 ctx->sampler_view_count[stage]);
1303
1304 free(descriptors);
1305 } else {
1306 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1307
1308 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1309 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1310
1311 panfrost_update_sampler_view(view, &ctx->base);
1312
1313 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1314 }
1315
1316 postfix->textures = panfrost_pool_upload(&batch->pool,
1317 trampolines,
1318 sizeof(uint64_t) *
1319 ctx->sampler_view_count[stage]);
1320 }
1321 }
1322
1323 void
1324 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1325 enum pipe_shader_type stage,
1326 struct mali_vertex_tiler_postfix *postfix)
1327 {
1328 struct panfrost_context *ctx = batch->ctx;
1329 struct panfrost_device *device = pan_device(ctx->base.screen);
1330
1331 if (!ctx->sampler_count[stage])
1332 return;
1333
1334 if (device->quirks & IS_BIFROST) {
1335 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1336 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1337 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1338 transfer_size);
1339 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1340
1341 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1342 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1343
1344 postfix->sampler_descriptor = transfer.gpu;
1345 } else {
1346 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1347 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1348 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1349 transfer_size);
1350 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1351
1352 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1353 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1354
1355 postfix->sampler_descriptor = transfer.gpu;
1356 }
1357 }
1358
1359 void
1360 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1361 struct mali_vertex_tiler_postfix *vertex_postfix)
1362 {
1363 struct panfrost_context *ctx = batch->ctx;
1364
1365 if (!ctx->vertex)
1366 return;
1367
1368 struct panfrost_vertex_state *so = ctx->vertex;
1369
1370 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1371 vertex_postfix->attribute_meta = panfrost_pool_upload(&batch->pool, so->hw,
1372 sizeof(*so->hw) *
1373 PAN_MAX_ATTRIBUTE);
1374 }
1375
1376 void
1377 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1378 struct mali_vertex_tiler_postfix *vertex_postfix)
1379 {
1380 struct panfrost_context *ctx = batch->ctx;
1381 struct panfrost_vertex_state *so = ctx->vertex;
1382
1383 /* Staged mali_attr, and index into them. i =/= k, depending on the
1384 * vertex buffer mask and instancing. Twice as much room is allocated,
1385 * for a worst case of NPOT_DIVIDEs which take up an extra slot */
1386 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1387 unsigned k = 0;
1388
1389 for (unsigned i = 0; i < so->num_elements; ++i) {
1390 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1391 * means duplicating some vertex buffers (who cares? aside from
1392 * maybe some caching implications but I somehow doubt that
1393 * matters) */
1394
1395 struct pipe_vertex_element *elem = &so->pipe[i];
1396 unsigned vbi = elem->vertex_buffer_index;
1397
1398 /* The exception to 1:1 mapping is that we can have multiple
1399 * entries (NPOT divisors), so we fixup anyways */
1400
1401 so->hw[i].index = k;
1402
1403 if (!(ctx->vb_mask & (1 << vbi)))
1404 continue;
1405
1406 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1407 struct panfrost_resource *rsrc;
1408
1409 rsrc = pan_resource(buf->buffer.resource);
1410 if (!rsrc)
1411 continue;
1412
1413 /* Align to 64 bytes by masking off the lower bits. This
1414 * will be adjusted back when we fixup the src_offset in
1415 * mali_attr_meta */
1416
1417 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1418 mali_ptr addr = raw_addr & ~63;
1419 unsigned chopped_addr = raw_addr - addr;
1420
1421 /* Add a dependency of the batch on the vertex buffer */
1422 panfrost_batch_add_bo(batch, rsrc->bo,
1423 PAN_BO_ACCESS_SHARED |
1424 PAN_BO_ACCESS_READ |
1425 PAN_BO_ACCESS_VERTEX_TILER);
1426
1427 /* Set common fields */
1428 attrs[k].elements = addr;
1429 attrs[k].stride = buf->stride;
1430
1431 /* Since we advanced the base pointer, we shrink the buffer
1432 * size */
1433 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1434
1435 /* We need to add the extra size we masked off (for
1436 * correctness) so the data doesn't get clamped away */
1437 attrs[k].size += chopped_addr;
1438
1439 /* For non-instancing make sure we initialize */
1440 attrs[k].shift = attrs[k].extra_flags = 0;
1441
1442 /* Instancing uses a dramatically different code path than
1443 * linear, so dispatch for the actual emission now that the
1444 * common code is finished */
1445
1446 unsigned divisor = elem->instance_divisor;
1447
1448 if (divisor && ctx->instance_count == 1) {
1449 /* Silly corner case where there's a divisor(=1) but
1450 * there's no legitimate instancing. So we want *every*
1451 * attribute to be the same. So set stride to zero so
1452 * we don't go anywhere. */
1453
1454 attrs[k].size = attrs[k].stride + chopped_addr;
1455 attrs[k].stride = 0;
1456 attrs[k++].elements |= MALI_ATTR_LINEAR;
1457 } else if (ctx->instance_count <= 1) {
1458 /* Normal, non-instanced attributes */
1459 attrs[k++].elements |= MALI_ATTR_LINEAR;
1460 } else {
1461 unsigned instance_shift = vertex_postfix->instance_shift;
1462 unsigned instance_odd = vertex_postfix->instance_odd;
1463
1464 k += panfrost_vertex_instanced(ctx->padded_count,
1465 instance_shift,
1466 instance_odd,
1467 divisor, &attrs[k]);
1468 }
1469 }
1470
1471 /* Add special gl_VertexID/gl_InstanceID buffers */
1472
1473 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1474 so->hw[PAN_VERTEX_ID].index = k++;
1475 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1476 so->hw[PAN_INSTANCE_ID].index = k++;
1477
1478 /* Upload whatever we emitted and go */
1479
1480 vertex_postfix->attributes = panfrost_pool_upload(&batch->pool, attrs,
1481 k * sizeof(*attrs));
1482 }
1483
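/* Allocates transient space for a varying buffer of the given stride and
 * vertex count, fills out its attribute record and returns the GPU address. */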
1484 static mali_ptr
1485 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1486 unsigned stride, unsigned count)
1487 {
1488 /* Fill out the descriptor */
1489 slot->stride = stride;
1490 slot->size = stride * count;
1491 slot->shift = slot->extra_flags = 0;
1492
1493 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1494 slot->size);
1495
1496 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1497
1498 return transfer.gpu;
1499 }
1500
1501 static unsigned
1502 panfrost_streamout_offset(unsigned stride, unsigned offset,
1503 struct pipe_stream_output_target *target)
1504 {
1505 return (target->buffer_offset + (offset * stride * 4)) & 63;
1506 }
1507
1508 static void
1509 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1510 unsigned stride, unsigned offset, unsigned count,
1511 struct pipe_stream_output_target *target)
1512 {
1513 /* Fill out the descriptor */
1514 slot->stride = stride * 4;
1515 slot->shift = slot->extra_flags = 0;
1516
1517 unsigned max_size = target->buffer_size;
1518 unsigned expected_size = slot->stride * count;
1519
1520 /* Grab the BO and bind it to the batch */
1521 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1522
1523 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1524 * the perspective of the TILER and FRAGMENT.
1525 */
1526 panfrost_batch_add_bo(batch, bo,
1527 PAN_BO_ACCESS_SHARED |
1528 PAN_BO_ACCESS_RW |
1529 PAN_BO_ACCESS_VERTEX_TILER |
1530 PAN_BO_ACCESS_FRAGMENT);
1531
1532 /* We will have an offset applied to get alignment */
1533 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1534 slot->elements = (addr & ~63) | MALI_ATTR_LINEAR;
1535 slot->size = MIN2(max_size, expected_size) + (addr & 63);
1536 }
1537
1538 static bool
1539 has_point_coord(unsigned mask, gl_varying_slot loc)
1540 {
1541 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1542 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1543 else if (loc == VARYING_SLOT_PNTC)
1544 return (mask & (1 << 8));
1545 else
1546 return false;
1547 }
1548
1549 /* Helpers for manipulating stream out information so we can pack varyings
1550 * accordingly. Compute the src_offset for a given captured varying */
1551
1552 static struct pipe_stream_output *
1553 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1554 {
1555 for (unsigned i = 0; i < info->num_outputs; ++i) {
1556 if (info->output[i].register_index == loc)
1557 return &info->output[i];
1558 }
1559
1560 unreachable("Varying not captured");
1561 }
1562
1563 static unsigned
1564 pan_varying_size(enum mali_format fmt)
1565 {
1566 unsigned type = MALI_EXTRACT_TYPE(fmt);
1567 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1568 unsigned bits = MALI_EXTRACT_BITS(fmt);
1569 unsigned bpc = 0;
1570
1571 if (bits == MALI_CHANNEL_FLOAT) {
1572 /* No doubles */
1573 bool fp16 = (type == MALI_FORMAT_SINT);
1574 assert(fp16 || (type == MALI_FORMAT_UNORM));
1575
1576 bpc = fp16 ? 2 : 4;
1577 } else {
1578 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1579
1580 /* See the enums */
1581 bits = 1 << bits;
1582 assert(bits >= 8);
1583 bpc = bits / 8;
1584 }
1585
1586 return bpc * chan;
1587 }
1588
1589 /* Indices for named (non-XFB) varyings that are present. These are packed
1590 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1591 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1592 * of a given special field given a shift S by:
1593 *
1594 * idx = popcount(P & ((1 << S) - 1))
1595 *
1596 * That is, look at all of the varyings that come earlier and count them; the
1597  * count is the index of the new one. Likewise, the total number of special
1598 * buffers required is simply popcount(P)
1599 */
1600
1601 enum pan_special_varying {
1602 PAN_VARY_GENERAL = 0,
1603 PAN_VARY_POSITION = 1,
1604 PAN_VARY_PSIZ = 2,
1605 PAN_VARY_PNTCOORD = 3,
1606 PAN_VARY_FACE = 4,
1607 PAN_VARY_FRAGCOORD = 5,
1608
1609 /* Keep last */
1610 PAN_VARY_MAX,
1611 };
1612
1613 /* Given a varying, figure out which index it corresponds to */
1614
1615 static inline unsigned
1616 pan_varying_index(unsigned present, enum pan_special_varying v)
1617 {
1618 unsigned mask = (1 << v) - 1;
1619 return util_bitcount(present & mask);
1620 }
1621
1622 /* Get the base offset for XFB buffers, which by convention come after
1623 * everything else. Wrapper function for semantic reasons; by construction this
1624 * is just popcount. */
1625
1626 static inline unsigned
1627 pan_xfb_base(unsigned present)
1628 {
1629 return util_bitcount(present);
1630 }
1631
1632 /* Computes the present mask for varyings so we can start emitting varying records */
1633
1634 static inline unsigned
1635 pan_varying_present(
1636 struct panfrost_shader_state *vs,
1637 struct panfrost_shader_state *fs,
1638 unsigned quirks)
1639 {
1640 /* At the moment we always emit general and position buffers. Not
1641 * strictly necessary but usually harmless */
1642
1643 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1644
1645 /* Enable special buffers by the shader info */
1646
1647 if (vs->writes_point_size)
1648 present |= (1 << PAN_VARY_PSIZ);
1649
1650 if (fs->reads_point_coord)
1651 present |= (1 << PAN_VARY_PNTCOORD);
1652
1653 if (fs->reads_face)
1654 present |= (1 << PAN_VARY_FACE);
1655
1656 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1657 present |= (1 << PAN_VARY_FRAGCOORD);
1658
1659 /* Also, if we have a point sprite, we need a point coord buffer */
1660
1661 for (unsigned i = 0; i < fs->varying_count; i++) {
1662 gl_varying_slot loc = fs->varyings_loc[i];
1663
1664 if (has_point_coord(fs->point_sprite_mask, loc))
1665 present |= (1 << PAN_VARY_PNTCOORD);
1666 }
1667
1668 return present;
1669 }
1670
1671 /* Emitters for varying records */
1672
1673 static struct mali_attr_meta
1674 pan_emit_vary(unsigned present, enum pan_special_varying buf,
1675 unsigned quirks, enum mali_format format,
1676 unsigned offset)
1677 {
1678 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1679
1680 struct mali_attr_meta meta = {
1681 .index = pan_varying_index(present, buf),
1682 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1683 .swizzle = quirks & HAS_SWIZZLES ?
1684 panfrost_get_default_swizzle(nr_channels) :
1685 panfrost_bifrost_swizzle(nr_channels),
1686 .format = format,
1687 .src_offset = offset
1688 };
1689
1690 return meta;
1691 }
1692
1693 /* General varying that is unused */
1694
1695 static struct mali_attr_meta
1696 pan_emit_vary_only(unsigned present, unsigned quirks)
1697 {
1698 return pan_emit_vary(present, 0, quirks, MALI_VARYING_DISCARD, 0);
1699 }
1700
1701 /* Special records */
1702
1703 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1704 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1705 [PAN_VARY_PSIZ] = MALI_R16F,
1706 [PAN_VARY_PNTCOORD] = MALI_R16F,
1707 [PAN_VARY_FACE] = MALI_R32I,
1708 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1709 };
1710
1711 static struct mali_attr_meta
1712 pan_emit_vary_special(unsigned present, enum pan_special_varying buf,
1713 unsigned quirks)
1714 {
1715 assert(buf < PAN_VARY_MAX);
1716 return pan_emit_vary(present, buf, quirks, pan_varying_formats[buf], 0);
1717 }
1718
1719 static enum mali_format
1720 pan_xfb_format(enum mali_format format, unsigned nr)
1721 {
1722 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1723 return MALI_R32F | MALI_NR_CHANNELS(nr);
1724 else
1725 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1726 }
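
/* Illustrative example: capturing two components of a MALI_RGBA32F varying
 * gives a 2-channel, 32-bit-per-channel float record; either branch above
 * forces 32-bit channels, since transform feedback always stores at full
 * precision regardless of the varying's own format. */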
1727
1728 /* Transform feedback records. Note that struct pipe_stream_output packs into
1729  * a 32-bit bitfield, smaller than a 64-bit pointer, so we may as well pass it
1730  * by value. */
1731
1732 static struct mali_attr_meta
1733 pan_emit_vary_xfb(unsigned present,
1734 unsigned max_xfb,
1735 unsigned *streamout_offsets,
1736 unsigned quirks,
1737 enum mali_format format,
1738 struct pipe_stream_output o)
1739 {
1740         /* Construct a transform feedback record for this output */
1741 struct mali_attr_meta meta = {
1742 /* XFB buffers come after everything else */
1743 .index = pan_xfb_base(present) + o.output_buffer,
1744
1745 /* As usual unknown bit */
1746 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1747
1748 /* Override swizzle with number of channels */
1749 .swizzle = quirks & HAS_SWIZZLES ?
1750 panfrost_get_default_swizzle(o.num_components) :
1751 panfrost_bifrost_swizzle(o.num_components),
1752
1753 /* Override number of channels and precision to highp */
1754 .format = pan_xfb_format(format, o.num_components),
1755
1756 /* Apply given offsets together */
1757 .src_offset = (o.dst_offset * 4) /* dwords */
1758 + streamout_offsets[o.output_buffer]
1759 };
1760
1761 return meta;
1762 }
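
/* Offset example (illustrative): pipe_stream_output::dst_offset is counted in
 * dwords, so an output with dst_offset = 2 bound to buffer 1 lands at
 * src_offset = 2 * 4 + streamout_offsets[1] bytes into that buffer. */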
1763
1764 /* Determine if we should capture a varying for XFB. This requires actually
1765  * having a buffer for it. If we don't capture it, we'll fall back to a general
1766 * varying path (linked or unlinked, possibly discarding the write) */
1767
1768 static bool
1769 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1770 unsigned loc, unsigned max_xfb)
1771 {
1772 if (!(xfb->so_mask & (1ll << loc)))
1773 return false;
1774
1775 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1776 return o->output_buffer < max_xfb;
1777 }
1778
1779 /* Higher-level wrapper around all of the above, classifying a varying into one
1780 * of the above types */
1781
1782 static struct mali_attr_meta
1783 panfrost_emit_varying(
1784 struct panfrost_shader_state *stage,
1785 struct panfrost_shader_state *other,
1786 struct panfrost_shader_state *xfb,
1787 unsigned present,
1788 unsigned max_xfb,
1789 unsigned *streamout_offsets,
1790 unsigned quirks,
1791 unsigned *gen_offsets,
1792 enum mali_format *gen_formats,
1793 unsigned *gen_stride,
1794 unsigned idx,
1795 bool should_alloc,
1796 bool is_fragment)
1797 {
1798 gl_varying_slot loc = stage->varyings_loc[idx];
1799 enum mali_format format = stage->varyings[idx];
1800
1801 /* Override format to match linkage */
1802 if (!should_alloc && gen_formats[idx])
1803 format = gen_formats[idx];
1804
1805 if (has_point_coord(stage->point_sprite_mask, loc)) {
1806 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1807 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1808 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1809 return pan_emit_vary_xfb(present, max_xfb, streamout_offsets, quirks, format, *o);
1810 } else if (loc == VARYING_SLOT_POS) {
1811 if (is_fragment)
1812 return pan_emit_vary_special(present, PAN_VARY_FRAGCOORD, quirks);
1813 else
1814 return pan_emit_vary_special(present, PAN_VARY_POSITION, quirks);
1815 } else if (loc == VARYING_SLOT_PSIZ) {
1816 return pan_emit_vary_special(present, PAN_VARY_PSIZ, quirks);
1817 } else if (loc == VARYING_SLOT_PNTC) {
1818 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1819 } else if (loc == VARYING_SLOT_FACE) {
1820 return pan_emit_vary_special(present, PAN_VARY_FACE, quirks);
1821 }
1822
1823 /* We've exhausted special cases, so it's otherwise a general varying. Check if we're linked */
1824 signed other_idx = -1;
1825
1826 for (unsigned j = 0; j < other->varying_count; ++j) {
1827 if (other->varyings_loc[j] == loc) {
1828 other_idx = j;
1829 break;
1830 }
1831 }
1832
1833 if (other_idx < 0)
1834 return pan_emit_vary_only(present, quirks);
1835
1836 unsigned offset = gen_offsets[other_idx];
1837
1838 if (should_alloc) {
1839 /* We're linked, so allocate a space via a watermark allocation */
1840 enum mali_format alt = other->varyings[other_idx];
1841
1842 /* Do interpolation at minimum precision */
1843 unsigned size_main = pan_varying_size(format);
1844 unsigned size_alt = pan_varying_size(alt);
1845 unsigned size = MIN2(size_main, size_alt);
1846
1847 /* If a varying is marked for XFB but not actually captured, we
1848 * should match the format to the format that would otherwise
1849 * be used for XFB, since dEQP checks for invariance here. It's
1850 * unclear if this is required by the spec. */
1851
1852 if (xfb->so_mask & (1ull << loc)) {
1853 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1854 format = pan_xfb_format(format, o->num_components);
1855 size = pan_varying_size(format);
1856 } else if (size == size_alt) {
1857 format = alt;
1858 }
1859
1860 gen_offsets[idx] = *gen_stride;
1861 gen_formats[other_idx] = format;
1862 offset = *gen_stride;
1863 *gen_stride += size;
1864 }
1865
1866 return pan_emit_vary(present, PAN_VARY_GENERAL,
1867 quirks, format, offset);
1868 }
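
/* Linkage note (describing the two passes in panfrost_emit_varying_descriptor
 * below): this helper runs once per varying for the vertex shader with
 * should_alloc = true, assigning offsets in the general buffer and bumping
 * *gen_stride, and once for the fragment shader with should_alloc = false,
 * reusing the offsets and formats recorded in gen_offsets/gen_formats so both
 * stages agree on the packed layout. */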
1869
1870 static void
1871 pan_emit_special_input(union mali_attr *varyings,
1872 unsigned present,
1873 enum pan_special_varying v,
1874 mali_ptr addr)
1875 {
1876 if (present & (1 << v)) {
1877 /* Ensure we write exactly once for performance and with fields
1878 * zeroed appropriately to avoid flakes */
1879
1880 union mali_attr s = {
1881 .elements = addr
1882 };
1883
1884 varyings[pan_varying_index(present, v)] = s;
1885 }
1886 }
1887
1888 void
1889 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1890 unsigned vertex_count,
1891 struct mali_vertex_tiler_postfix *vertex_postfix,
1892 struct mali_vertex_tiler_postfix *tiler_postfix,
1893 union midgard_primitive_size *primitive_size)
1894 {
1895 /* Load the shaders */
1896 struct panfrost_context *ctx = batch->ctx;
1897 struct panfrost_device *dev = pan_device(ctx->base.screen);
1898 struct panfrost_shader_state *vs, *fs;
1899 size_t vs_size, fs_size;
1900
1901 /* Allocate the varying descriptor */
1902
1903 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1904 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1905 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1906 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1907
1908 struct panfrost_transfer trans = panfrost_pool_alloc(&batch->pool,
1909 vs_size +
1910 fs_size);
1911
1912 struct pipe_stream_output_info *so = &vs->stream_output;
1913 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1914
1915 /* Check if this varying is linked by us. This is the case for
1916 * general-purpose, non-captured varyings. If it is, link it. If it's
1917 * not, use the provided stream out information to determine the
1918 * offset, since it was already linked for us. */
1919
1920 unsigned gen_offsets[32];
1921 enum mali_format gen_formats[32];
1922 memset(gen_offsets, 0, sizeof(gen_offsets));
1923 memset(gen_formats, 0, sizeof(gen_formats));
1924
1925 unsigned gen_stride = 0;
1926 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1927 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1928
1929 unsigned streamout_offsets[32];
1930
1931 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1932 streamout_offsets[i] = panfrost_streamout_offset(
1933 so->stride[i],
1934 ctx->streamout.offsets[i],
1935 ctx->streamout.targets[i]);
1936 }
1937
1938 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1939 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1940
1941 for (unsigned i = 0; i < vs->varying_count; i++) {
1942 ovs[i] = panfrost_emit_varying(vs, fs, vs, present,
1943 ctx->streamout.num_targets, streamout_offsets,
1944 dev->quirks,
1945 gen_offsets, gen_formats, &gen_stride, i, true, false);
1946 }
1947
1948 for (unsigned i = 0; i < fs->varying_count; i++) {
1949 ofs[i] = panfrost_emit_varying(fs, vs, vs, present,
1950 ctx->streamout.num_targets, streamout_offsets,
1951 dev->quirks,
1952 gen_offsets, gen_formats, &gen_stride, i, false, true);
1953 }
1954
1955 unsigned xfb_base = pan_xfb_base(present);
1956 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
1957 sizeof(union mali_attr) * (xfb_base + ctx->streamout.num_targets));
1958 union mali_attr *varyings = (union mali_attr *) T.cpu;
1959
1960 /* Emit the stream out buffers */
1961
1962 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1963 ctx->vertex_count);
1964
1965 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1966 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1967 so->stride[i],
1968 ctx->streamout.offsets[i],
1969 out_count,
1970 ctx->streamout.targets[i]);
1971 }
1972
1973 panfrost_emit_varyings(batch,
1974 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1975 gen_stride, vertex_count);
1976
1977 /* fp32 vec4 gl_Position */
1978 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
1979 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
1980 sizeof(float) * 4, vertex_count);
1981
1982 if (present & (1 << PAN_VARY_PSIZ)) {
1983 primitive_size->pointer = panfrost_emit_varyings(batch,
1984 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
1985 2, vertex_count);
1986 }
1987
1988 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_VARYING_POINT_COORD);
1989 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_VARYING_FRONT_FACING);
1990 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_VARYING_FRAG_COORD);
1991
1992 vertex_postfix->varyings = T.gpu;
1993 tiler_postfix->varyings = T.gpu;
1994
1995 vertex_postfix->varying_meta = trans.gpu;
1996 tiler_postfix->varying_meta = trans.gpu + vs_size;
1997 }
1998
1999 void
2000 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2001 struct mali_vertex_tiler_prefix *vertex_prefix,
2002 struct mali_vertex_tiler_postfix *vertex_postfix,
2003 struct mali_vertex_tiler_prefix *tiler_prefix,
2004 struct mali_vertex_tiler_postfix *tiler_postfix,
2005 union midgard_primitive_size *primitive_size)
2006 {
2007 struct panfrost_context *ctx = batch->ctx;
2008 struct panfrost_device *device = pan_device(ctx->base.screen);
2009 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2010 struct bifrost_payload_vertex bifrost_vertex = {0,};
2011 struct bifrost_payload_tiler bifrost_tiler = {0,};
2012 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2013 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2014 void *vp, *tp;
2015 size_t vp_size, tp_size;
2016
2017 if (device->quirks & IS_BIFROST) {
2018 bifrost_vertex.prefix = *vertex_prefix;
2019 bifrost_vertex.postfix = *vertex_postfix;
2020 vp = &bifrost_vertex;
2021 vp_size = sizeof(bifrost_vertex);
2022
2023 bifrost_tiler.prefix = *tiler_prefix;
2024 bifrost_tiler.tiler.primitive_size = *primitive_size;
2025 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2026 bifrost_tiler.postfix = *tiler_postfix;
2027 tp = &bifrost_tiler;
2028 tp_size = sizeof(bifrost_tiler);
2029 } else {
2030 midgard_vertex.prefix = *vertex_prefix;
2031 midgard_vertex.postfix = *vertex_postfix;
2032 vp = &midgard_vertex;
2033 vp_size = sizeof(midgard_vertex);
2034
2035 midgard_tiler.prefix = *tiler_prefix;
2036 midgard_tiler.postfix = *tiler_postfix;
2037 midgard_tiler.primitive_size = *primitive_size;
2038 tp = &midgard_tiler;
2039 tp_size = sizeof(midgard_tiler);
2040 }
2041
2042 if (wallpapering) {
2043 /* Inject in reverse order, with "predicted" job indices.
2044 * THIS IS A HACK XXX */
2045 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2046 batch->scoreboard.job_index + 2, tp, tp_size, true);
2047 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2048 vp, vp_size, true);
2049 return;
2050 }
2051
2052         /* If rasterizer discard is enabled, only submit the vertex job */
2053
2054 bool rasterizer_discard = ctx->rasterizer &&
2055 ctx->rasterizer->base.rasterizer_discard;
2056
2057 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2058 vp, vp_size, false);
2059
2060 if (rasterizer_discard)
2061 return;
2062
2063 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2064 false);
2065 }
2066
2067 /* TODO: stop hardcoding this */
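/* The table below holds 96 uint16_t values, i.e. 48 (x, y) pairs, matching
 * the 96 * sizeof(uint16_t) upload below. The coordinates appear to be in
 * 1/256ths of a pixel with (128, 128) at the pixel centre, but that is an
 * observation rather than a documented layout. */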
2068 mali_ptr
2069 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2070 {
2071 uint16_t locations[] = {
2072 128, 128,
2073 0, 256,
2074 0, 256,
2075 0, 256,
2076 0, 256,
2077 0, 256,
2078 0, 256,
2079 0, 256,
2080 0, 256,
2081 0, 256,
2082 0, 256,
2083 0, 256,
2084 0, 256,
2085 0, 256,
2086 0, 256,
2087 0, 256,
2088 0, 256,
2089 0, 256,
2090 0, 256,
2091 0, 256,
2092 0, 256,
2093 0, 256,
2094 0, 256,
2095 0, 256,
2096 0, 256,
2097 0, 256,
2098 0, 256,
2099 0, 256,
2100 0, 256,
2101 0, 256,
2102 0, 256,
2103 0, 256,
2104 128, 128,
2105 0, 0,
2106 0, 0,
2107 0, 0,
2108 0, 0,
2109 0, 0,
2110 0, 0,
2111 0, 0,
2112 0, 0,
2113 0, 0,
2114 0, 0,
2115 0, 0,
2116 0, 0,
2117 0, 0,
2118 0, 0,
2119 0, 0,
2120 };
2121
2122 return panfrost_pool_upload(&batch->pool, locations, 96 * sizeof(uint16_t));
2123 }