mesa.git (f57d240748a5dd1fbe76cadc17975df0f9ac0010): src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181  * good for the duration of the draw (transient), though it may last longer. Also get
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These bits don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
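/* As far as this code is concerned (inferred from the assignments below,
 * not from hardware documentation): only the [min_index, max_index] slice
 * of vertices is processed, offset_start rebases the draw by min_index
 * plus the index bias, and offset_bias_correction (simply -min_index)
 * compensates so fetched index values still reference the right
 * attributes. */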
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
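/* instance_shift/instance_odd encode the padded vertex count as
 * padded = (2k + 1) << shift, i.e. an odd factor times a power of two,
 * with shift = ctz(padded) and k = padded >> (shift + 1). Worked example:
 * padded = 12 = 0b1100 gives shift = 2, k = 1, and (2*1 + 1) << 2 = 12. */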
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static void
310 panfrost_shader_meta_init(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
319 meta->attribute_count = ss->attribute_count;
320 meta->varying_count = ss->varying_count;
321 meta->texture_count = ctx->sampler_view_count[st];
322 meta->sampler_count = ctx->sampler_count[st];
323
324 if (dev->quirks & IS_BIFROST) {
325 if (st == PIPE_SHADER_VERTEX)
326 meta->bifrost1.unk1 = 0x800000;
327 else {
328 /* First clause ATEST |= 0x4000000.
329 * Less than 32 regs |= 0x200 */
330 meta->bifrost1.unk1 = 0x950020;
331 }
332
333 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
334 if (st == PIPE_SHADER_VERTEX)
335 meta->bifrost2.preload_regs = 0xC0;
336 else {
337 meta->bifrost2.preload_regs = 0x1;
338 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
339 }
340
341 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
342 ss->uniform_cutoff);
343 } else {
344 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
345 ss->uniform_cutoff);
346 meta->midgard1.work_count = ss->work_reg_count;
347
348 /* TODO: This is not conformant on ES3 */
349 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
350
351 meta->midgard1.flags_lo = 0x20;
352 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
353
354 SET_BIT(meta->midgard1.flags_lo, MALI_WRITES_GLOBAL, ss->writes_global);
355 }
356 }
357
358 static unsigned
359 translate_tex_wrap(enum pipe_tex_wrap w)
360 {
361 switch (w) {
362 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
363 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
364 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
365 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
366 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
367 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
368 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
369 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
370 default: unreachable("Invalid wrap");
371 }
372 }
373
374 /* The hardware compares in the wrong order, so we have to flip before
375 * encoding. Yes, really. */
376
377 static enum mali_func
378 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
379 {
380 if (!cso->compare_mode)
381 return MALI_FUNC_NEVER;
382
383 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
384 return panfrost_flip_compare_func(f);
385 }
386
387 static enum mali_mipmap_mode
388 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
389 {
390 switch (f) {
391 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
392 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
393 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
394 default: unreachable("Invalid");
395 }
396 }
397
398 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
399 struct mali_midgard_sampler_packed *hw)
400 {
401 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
402 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
403 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
404 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
405 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
406 cfg.normalized_coordinates = cso->normalized_coords;
407
408 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
409
410 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
411
412 /* If necessary, we disable mipmapping in the sampler descriptor by
413  * clamping the LOD range as tight as possible (to one step above the
414  * minimum LOD, essentially -- remember these are fixed-point numbers,
415  * so that step is 1/256) */
416
417 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
418 cfg.minimum_lod + 1 :
419 FIXED_16(cso->max_lod, false);
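/* Worked example, assuming FIXED_16 uses 8 fractional bits (consistent
 * with the 1/256 epsilon mentioned above): min_lod = 2.0 packs to 512,
 * and with MIPFILTER_NONE the maximum becomes 513, i.e. 2.00390625. */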
420
421 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
422 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
423 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
424
425 cfg.compare_function = panfrost_sampler_compare_func(cso);
426 cfg.seamless_cube_map = cso->seamless_cube_map;
427
428 cfg.border_color_r = cso->border_color.f[0];
429 cfg.border_color_g = cso->border_color.f[1];
430 cfg.border_color_b = cso->border_color.f[2];
431 cfg.border_color_a = cso->border_color.f[3];
432 }
433 }
434
435 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
436 struct mali_bifrost_sampler_packed *hw)
437 {
438 pan_pack(hw, BIFROST_SAMPLER, cfg) {
439 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
440 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
441 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
442 cfg.normalized_coordinates = cso->normalized_coords;
443
444 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
445 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
446 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
447
448 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
449 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
450 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
451
452 cfg.compare_function = panfrost_sampler_compare_func(cso);
453 cfg.seamless_cube_map = cso->seamless_cube_map;
454 }
455 }
456
457 static void
458 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
459 struct mali_shader_meta *fragmeta)
460 {
461 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
462
463 bool msaa = rast->multisample;
464
465 /* TODO: Sample size */
466 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
467 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
468
469 struct panfrost_shader_state *fs;
470 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
471
472 /* EXT_shader_framebuffer_fetch requires the shader to be run
473 * per-sample when outputs are read. */
474 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
475 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
476
477 fragmeta->depth_units = rast->offset_units * 2.0f;
478 fragmeta->depth_factor = rast->offset_scale;
479
480 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
481
482 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
483 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
484
485 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
486 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
487 }
488
489 static void
490 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
491 struct mali_shader_meta *fragmeta)
492 {
493 const struct panfrost_zsa_state *so = ctx->depth_stencil;
494
495 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
496 so->base.stencil[0].enabled);
497
498 fragmeta->stencil_mask_front = so->stencil_mask_front;
499 fragmeta->stencil_mask_back = so->stencil_mask_back;
500
501 /* Bottom bits for stencil ref, exactly one word */
502 fragmeta->stencil_front.opaque[0] = so->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
503
504 /* If back-stencil is not enabled, use the front values */
505
506 if (so->base.stencil[1].enabled)
507 fragmeta->stencil_back.opaque[0] = so->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
508 else
509 fragmeta->stencil_back = fragmeta->stencil_front;
510
511 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
512 so->base.depth.writemask);
513
514 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
515 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
516 so->base.depth.enabled ? so->base.depth.func : PIPE_FUNC_ALWAYS));
517 }
518
519 static bool
520 panfrost_fs_required(
521 struct panfrost_shader_state *fs,
522 struct panfrost_blend_final *blend,
523 unsigned rt_count)
524 {
525 /* If the shader has side effects, we must execute it */
526 if (fs->fs_sidefx)
527 return true;
528
529 /* If colour is written we need to execute */
530 for (unsigned i = 0; i < rt_count; ++i) {
531 if (!blend[i].no_colour)
532 return true;
533 }
534
535 /* If depth is written and not implied we need to execute.
536 * TODO: Predicate on Z/S writes being enabled */
537 return (fs->writes_depth || fs->writes_stencil);
538 }
539
540 static void
541 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
542 struct mali_shader_meta *fragmeta,
543 struct panfrost_blend_final *blend)
544 {
545 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
546 const struct panfrost_device *dev = pan_device(ctx->base.screen);
547 struct panfrost_shader_state *fs;
548 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
549
550 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
551 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
552 !ctx->blend->base.dither);
553
554 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
555 ctx->blend->base.alpha_to_coverage);
556
557 /* Get blending setup */
558 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
559
560 /* Disable shader execution if we can */
561 if (dev->quirks & MIDGARD_SHADERLESS
562 && !panfrost_fs_required(fs, blend, rt_count)) {
563 fragmeta->shader = 0;
564 fragmeta->attribute_count = 0;
565 fragmeta->varying_count = 0;
566 fragmeta->texture_count = 0;
567 fragmeta->sampler_count = 0;
568
569 /* This feature is not known to work on Bifrost */
570 fragmeta->midgard1.work_count = 1;
571 fragmeta->midgard1.uniform_count = 0;
572 fragmeta->midgard1.uniform_buffer_count = 0;
573 }
574
575 /* If there is a blend shader, work registers are shared. We impose 8
576 * work registers as a limit for blend shaders. Should be lower XXX */
577
578 if (!(dev->quirks & IS_BIFROST)) {
579 for (unsigned c = 0; c < rt_count; ++c) {
580 if (blend[c].is_shader) {
581 fragmeta->midgard1.work_count =
582 MAX2(fragmeta->midgard1.work_count, 8);
583 }
584 }
585 }
586
587 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
588 * copied to the blend_meta appended (by convention), but this is the
589 * field actually read by the hardware. (Or maybe both are read...?).
590 * Specify the last RTi with a blend shader. */
591
592 fragmeta->blend.shader = 0;
593
594 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
595 if (!blend[rt].is_shader)
596 continue;
597
598 fragmeta->blend.shader = blend[rt].shader.gpu |
599 blend[rt].shader.first_tag;
600 break;
601 }
602
603 if (dev->quirks & MIDGARD_SFBD) {
604 /* On single render target (SFBD) platforms, the blend
605 * information is inside the shader meta itself. We additionally
606 * need to signal CAN_DISCARD for nontrivial blend modes (so
607 * we're able to read back the destination buffer) */
608
609 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
610 blend[0].is_shader);
611
612 if (!blend[0].is_shader) {
613 fragmeta->blend.equation = *blend[0].equation.equation;
614 fragmeta->blend.constant = blend[0].equation.constant;
615 }
616
617 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
618 !blend[0].no_blending || fs->can_discard);
619
620 batch->draws |= PIPE_CLEAR_COLOR0;
621 return;
622 }
623
624 if (dev->quirks & IS_BIFROST) {
625 bool no_blend = true;
626
627 for (unsigned i = 0; i < rt_count; ++i)
628 no_blend &= (blend[i].no_blending | blend[i].no_colour);
629
630 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
631 !fs->can_discard && !fs->writes_depth && no_blend);
632 }
633 }
634
635 static void
636 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
637 struct panfrost_blend_final *blend)
638 {
639 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
640 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
641 unsigned rt_count = batch->key.nr_cbufs;
642
643 struct bifrost_blend_rt *brts = rts;
644 struct midgard_blend_rt *mrts = rts;
645
646 /* Disable blending for depth-only on Bifrost */
647
648 if (rt_count == 0 && dev->quirks & IS_BIFROST)
649 brts[0].unk2 = 0x3;
650
651 for (unsigned i = 0; i < rt_count; ++i) {
652 unsigned flags = 0;
653
654 pan_pack(&flags, BLEND_FLAGS, cfg) {
655 if (blend[i].no_colour)
656 break;
657
658 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
659
660 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
661 cfg.load_destination = !blend[i].no_blending; /* XXX */
662 cfg.dither_disable = !batch->ctx->blend->base.dither;
663
664 if (!(dev->quirks & IS_BIFROST))
665 cfg.midgard_blend_shader = blend[i].is_shader;
666 }
667
668 if (dev->quirks & IS_BIFROST) {
669 brts[i].flags = flags;
670
671 if (blend[i].is_shader) {
672 /* The blend shader's address needs to be at
673 * the same top 32 bit as the fragment shader.
674 * TODO: Ensure that's always the case.
675 */
676 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
677 (fs->bo->gpu & (0xffffffffull << 32)));
678 brts[i].shader = blend[i].shader.gpu;
679 brts[i].unk2 = 0x0;
680 } else {
681 enum pipe_format format = batch->key.cbufs[i]->format;
682 const struct util_format_description *format_desc;
683 format_desc = util_format_description(format);
684
685 brts[i].equation = *blend[i].equation.equation;
686
687 /* TODO: this is a bit more complicated */
688 brts[i].constant = blend[i].equation.constant;
689
690 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
691
692 /* 0x19 disables blending and forces REPLACE
693 * mode (equivalent to rgb_mode = alpha_mode =
694  * 0x122, colour mask = 0xF). 0x1a allows
695 * blending. */
696 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
697
698 brts[i].shader_type = fs->blend_types[i];
699 }
700 } else {
701 memcpy(&mrts[i].flags, &flags, sizeof(flags));
702
703 if (blend[i].is_shader) {
704 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
705 } else {
706 mrts[i].blend.equation = *blend[i].equation.equation;
707 mrts[i].blend.constant = blend[i].equation.constant;
708 }
709 }
710 }
711 }
712
713 static void
714 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
715 struct mali_shader_meta *fragmeta,
716 struct panfrost_blend_final *blend)
717 {
718 const struct panfrost_device *dev = pan_device(ctx->base.screen);
719 struct panfrost_shader_state *fs;
720
721 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
722
723 bool msaa = ctx->rasterizer->base.multisample;
724 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
725
726 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
727 fragmeta->unknown2_4 = 0x4e0;
728
729 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
730 * is required (independent of 32-bit/64-bit descriptors), or why it's
731 * not used on later GPU revisions. Otherwise, all shader jobs fault on
732 * these earlier chips (perhaps this is a chicken bit of some kind).
733 * More investigation is needed. */
734
735 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
736
737 if (dev->quirks & IS_BIFROST) {
738 /* TODO */
739 } else {
740 /* Depending on whether it's legal in the given shader, we try to
741 * enable early-z testing. TODO: respect e-z force */
742
743 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
744 !fs->can_discard && !fs->writes_global &&
745 !fs->writes_depth && !fs->writes_stencil &&
746 !ctx->blend->base.alpha_to_coverage);
747
748 /* Add the writes Z/S flags if needed. */
749 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
750 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
751
752 /* Any time texturing is used, derivatives are implicitly calculated,
753 * so we need to enable helper invocations */
754
755 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
756 fs->helper_invocations);
757
758 /* If discard is enabled, which bit we set to convey this
759 * depends on if depth/stencil is used for the draw or not.
760 * Just one of depth OR stencil is enough to trigger this. */
761
762 const struct pipe_depth_stencil_alpha_state *zsa = &ctx->depth_stencil->base;
763 bool zs_enabled =
764 fs->writes_depth || fs->writes_stencil ||
765 (zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS) ||
766 zsa->stencil[0].enabled;
767
768 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
769 fs->outputs_read || (!zs_enabled && fs->can_discard));
770 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, zs_enabled && fs->can_discard);
771 }
772
773 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
774 panfrost_frag_meta_zsa_update(ctx, fragmeta);
775 panfrost_frag_meta_blend_update(ctx, fragmeta, blend);
776 }
777
778 void
779 panfrost_emit_shader_meta(struct panfrost_batch *batch,
780 enum pipe_shader_type st,
781 struct mali_vertex_tiler_postfix *postfix)
782 {
783 struct panfrost_context *ctx = batch->ctx;
784 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
785
786 if (!ss) {
787 postfix->shader = 0;
788 return;
789 }
790
791 struct mali_shader_meta meta;
792
793 panfrost_shader_meta_init(ctx, st, &meta);
794
795 /* Add the shader BO to the batch. */
796 panfrost_batch_add_bo(batch, ss->bo,
797 PAN_BO_ACCESS_PRIVATE |
798 PAN_BO_ACCESS_READ |
799 panfrost_bo_access_for_stage(st));
800
801 mali_ptr shader_ptr;
802
803 if (st == PIPE_SHADER_FRAGMENT) {
804 struct panfrost_device *dev = pan_device(ctx->base.screen);
805 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
806 size_t desc_size = sizeof(meta);
807 void *rts = NULL;
808 struct panfrost_transfer xfer;
809 unsigned rt_size;
810
811 if (dev->quirks & MIDGARD_SFBD)
812 rt_size = 0;
813 else if (dev->quirks & IS_BIFROST)
814 rt_size = sizeof(struct bifrost_blend_rt);
815 else
816 rt_size = sizeof(struct midgard_blend_rt);
817
818 desc_size += rt_size * rt_count;
819
820 if (rt_size)
821 rts = rzalloc_size(ctx, rt_size * rt_count);
822
823 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
824
825 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
826 blend[c] = panfrost_get_blend_for_context(ctx, c);
827
828 panfrost_frag_shader_meta_init(ctx, &meta, blend);
829
830 if (!(dev->quirks & MIDGARD_SFBD))
831 panfrost_emit_blend(batch, rts, blend);
832
833 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
834
835 memcpy(xfer.cpu, &meta, sizeof(meta));
836 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
837
838 if (rt_size)
839 ralloc_free(rts);
840
841 shader_ptr = xfer.gpu;
842 } else {
843 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
844 sizeof(meta));
845 }
846
847 postfix->shader = shader_ptr;
848 }
849
850 void
851 panfrost_emit_viewport(struct panfrost_batch *batch,
852 struct mali_vertex_tiler_postfix *tiler_postfix)
853 {
854 struct panfrost_context *ctx = batch->ctx;
855 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
856 const struct pipe_scissor_state *ss = &ctx->scissor;
857 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
858 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
859
860 /* Derive min/max from translate/scale. Note since |x| >= 0 by
861 * definition, we have that -|x| <= |x| hence translate - |scale| <=
862 * translate + |scale|, so the ordering is correct here. */
863 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
864 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
865 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
866 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
867 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
868 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
869
870 /* Scissor to the intersection of the viewport and the scissor, clamped
871 * to the framebuffer */
872
873 unsigned minx = MIN2(fb->width, vp_minx);
874 unsigned maxx = MIN2(fb->width, vp_maxx);
875 unsigned miny = MIN2(fb->height, vp_miny);
876 unsigned maxy = MIN2(fb->height, vp_maxy);
877
878 if (ss && rast->scissor) {
879 minx = MAX2(ss->minx, minx);
880 miny = MAX2(ss->miny, miny);
881 maxx = MIN2(ss->maxx, maxx);
882 maxy = MIN2(ss->maxy, maxy);
883 }
884
885 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
886
887 pan_pack(T.cpu, VIEWPORT, cfg) {
888 cfg.scissor_minimum_x = minx;
889 cfg.scissor_minimum_y = miny;
890 cfg.scissor_maximum_x = maxx - 1;
891 cfg.scissor_maximum_y = maxy - 1;
892
893 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
894 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
895 }
896
897 tiler_postfix->viewport = T.gpu;
898 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
899 }
900
901 static mali_ptr
902 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
903 enum pipe_shader_type st,
904 struct panfrost_constant_buffer *buf,
905 unsigned index)
906 {
907 struct pipe_constant_buffer *cb = &buf->cb[index];
908 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
909
910 if (rsrc) {
911 panfrost_batch_add_bo(batch, rsrc->bo,
912 PAN_BO_ACCESS_SHARED |
913 PAN_BO_ACCESS_READ |
914 panfrost_bo_access_for_stage(st));
915
916 /* Alignment guaranteed by
917 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
918 return rsrc->bo->gpu + cb->buffer_offset;
919 } else if (cb->user_buffer) {
920 return panfrost_pool_upload_aligned(&batch->pool,
921 cb->user_buffer +
922 cb->buffer_offset,
923 cb->buffer_size, 16);
924 } else {
925 unreachable("No constant buffer");
926 }
927 }
928
929 struct sysval_uniform {
930 union {
931 float f[4];
932 int32_t i[4];
933 uint32_t u[4];
934 uint64_t du[2];
935 };
936 };
937
938 static void
939 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
940 struct sysval_uniform *uniform)
941 {
942 struct panfrost_context *ctx = batch->ctx;
943 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
944
945 uniform->f[0] = vp->scale[0];
946 uniform->f[1] = vp->scale[1];
947 uniform->f[2] = vp->scale[2];
948 }
949
950 static void
951 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
952 struct sysval_uniform *uniform)
953 {
954 struct panfrost_context *ctx = batch->ctx;
955 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
956
957 uniform->f[0] = vp->translate[0];
958 uniform->f[1] = vp->translate[1];
959 uniform->f[2] = vp->translate[2];
960 }
961
962 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
963 enum pipe_shader_type st,
964 unsigned int sysvalid,
965 struct sysval_uniform *uniform)
966 {
967 struct panfrost_context *ctx = batch->ctx;
968 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
969 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
970 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
971 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
972
973 assert(dim);
974 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
975
976 if (dim > 1)
977 uniform->i[1] = u_minify(tex->texture->height0,
978 tex->u.tex.first_level);
979
980 if (dim > 2)
981 uniform->i[2] = u_minify(tex->texture->depth0,
982 tex->u.tex.first_level);
983
984 if (is_array)
985 uniform->i[dim] = tex->texture->array_size;
986 }
987
988 static void
989 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
990 enum pipe_shader_type st,
991 unsigned ssbo_id,
992 struct sysval_uniform *uniform)
993 {
994 struct panfrost_context *ctx = batch->ctx;
995
996 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
997 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
998
999 /* Compute address */
1000 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1001
1002 panfrost_batch_add_bo(batch, bo,
1003 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1004 panfrost_bo_access_for_stage(st));
1005
1006 /* Upload address and size as sysval */
1007 uniform->du[0] = bo->gpu + sb.buffer_offset;
1008 uniform->u[2] = sb.buffer_size;
1009 }
1010
1011 static void
1012 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1013 enum pipe_shader_type st,
1014 unsigned samp_idx,
1015 struct sysval_uniform *uniform)
1016 {
1017 struct panfrost_context *ctx = batch->ctx;
1018 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1019
1020 uniform->f[0] = sampl->min_lod;
1021 uniform->f[1] = sampl->max_lod;
1022 uniform->f[2] = sampl->lod_bias;
1023
1024 /* Even without any errata, Midgard represents "no mipmapping" as
1025 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1026 * panfrost_create_sampler_state which also explains our choice of
1027 * epsilon value (again to keep behaviour consistent) */
1028
1029 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1030 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1031 }
1032
1033 static void
1034 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1035 struct sysval_uniform *uniform)
1036 {
1037 struct panfrost_context *ctx = batch->ctx;
1038
1039 uniform->u[0] = ctx->compute_grid->grid[0];
1040 uniform->u[1] = ctx->compute_grid->grid[1];
1041 uniform->u[2] = ctx->compute_grid->grid[2];
1042 }
1043
1044 static void
1045 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1046 struct panfrost_shader_state *ss,
1047 enum pipe_shader_type st)
1048 {
1049 struct sysval_uniform *uniforms = (void *)buf;
1050
1051 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1052 int sysval = ss->sysval[i];
1053
1054 switch (PAN_SYSVAL_TYPE(sysval)) {
1055 case PAN_SYSVAL_VIEWPORT_SCALE:
1056 panfrost_upload_viewport_scale_sysval(batch,
1057 &uniforms[i]);
1058 break;
1059 case PAN_SYSVAL_VIEWPORT_OFFSET:
1060 panfrost_upload_viewport_offset_sysval(batch,
1061 &uniforms[i]);
1062 break;
1063 case PAN_SYSVAL_TEXTURE_SIZE:
1064 panfrost_upload_txs_sysval(batch, st,
1065 PAN_SYSVAL_ID(sysval),
1066 &uniforms[i]);
1067 break;
1068 case PAN_SYSVAL_SSBO:
1069 panfrost_upload_ssbo_sysval(batch, st,
1070 PAN_SYSVAL_ID(sysval),
1071 &uniforms[i]);
1072 break;
1073 case PAN_SYSVAL_NUM_WORK_GROUPS:
1074 panfrost_upload_num_work_groups_sysval(batch,
1075 &uniforms[i]);
1076 break;
1077 case PAN_SYSVAL_SAMPLER:
1078 panfrost_upload_sampler_sysval(batch, st,
1079 PAN_SYSVAL_ID(sysval),
1080 &uniforms[i]);
1081 break;
1082 default:
1083 assert(0);
1084 }
1085 }
1086 }
1087
1088 static const void *
1089 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1090 unsigned index)
1091 {
1092 struct pipe_constant_buffer *cb = &buf->cb[index];
1093 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1094
1095 if (rsrc)
1096 return rsrc->bo->cpu;
1097 else if (cb->user_buffer)
1098 return cb->user_buffer;
1099 else
1100 unreachable("No constant buffer");
1101 }
1102
1103 void
1104 panfrost_emit_const_buf(struct panfrost_batch *batch,
1105 enum pipe_shader_type stage,
1106 struct mali_vertex_tiler_postfix *postfix)
1107 {
1108 struct panfrost_context *ctx = batch->ctx;
1109 struct panfrost_shader_variants *all = ctx->shader[stage];
1110
1111 if (!all)
1112 return;
1113
1114 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1115
1116 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1117
1118 /* Uniforms are implicitly UBO #0 */
1119 bool has_uniforms = buf->enabled_mask & (1 << 0);
1120
1121 /* Allocate room for the sysval and the uniforms */
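/* Layout of the resulting buffer is [ sysvals | user uniforms ]: sysvals
 * are written at offset 0 and the uniforms are memcpy'd in at sys_size
 * below. */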
1122 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1123 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1124 size_t size = sys_size + uniform_size;
1125 struct panfrost_transfer transfer =
1126 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1127
1128 /* Upload sysvals requested by the shader */
1129 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1130
1131 /* Upload uniforms */
1132 if (has_uniforms && uniform_size) {
1133 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1134 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1135 }
1136
1137 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1138 * uploaded */
1139
1140 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1141 assert(ubo_count >= 1);
1142
1143 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1144 struct panfrost_transfer ubos =
1145 panfrost_pool_alloc_aligned(&batch->pool, sz,
1146 MALI_UNIFORM_BUFFER_LENGTH);
1147
1148 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1149
1150 /* Upload uniforms as a UBO */
1151
1152 if (ss->uniform_count) {
1153 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1154 cfg.entries = ss->uniform_count;
1155 cfg.pointer = transfer.gpu;
1156 }
1157 } else {
1158 *ubo_ptr = 0;
1159 }
1160
1161 /* The rest are honest-to-goodness UBOs */
1162
1163 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1164 size_t usz = buf->cb[ubo].buffer_size;
1165 bool enabled = buf->enabled_mask & (1 << ubo);
1166 bool empty = usz == 0;
1167
1168 if (!enabled || empty) {
1169 ubo_ptr[ubo] = 0;
1170 continue;
1171 }
1172
1173 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1174 cfg.entries = DIV_ROUND_UP(usz, 16);
1175 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1176 stage, buf, ubo);
1177 }
1178 }
1179
1180 postfix->uniforms = transfer.gpu;
1181 postfix->uniform_buffers = ubos.gpu;
1182
1183 buf->dirty_mask = 0;
1184 }
1185
1186 void
1187 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1188 const struct pipe_grid_info *info,
1189 struct midgard_payload_vertex_tiler *vtp)
1190 {
1191 struct panfrost_context *ctx = batch->ctx;
1192 struct panfrost_device *dev = pan_device(ctx->base.screen);
1193 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1194 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1195 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1196 128));
1197
1198 unsigned log2_instances =
1199 util_logbase2_ceil(info->grid[0]) +
1200 util_logbase2_ceil(info->grid[1]) +
1201 util_logbase2_ceil(info->grid[2]);
1202
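/* Shared memory is sized as (per-workgroup size rounded up to a power of
 * two, minimum 128 bytes) * (a power-of-two upper bound on the number of
 * workgroups in the grid) * (core count). The power-of-two rounding is
 * what the code does; the hardware-side rationale is not documented here. */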
1203 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1204 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1205 shared_size,
1206 1);
1207
1208 struct mali_shared_memory shared = {
1209 .shared_memory = bo->gpu,
1210 .shared_workgroup_count = log2_instances,
1211 .shared_shift = util_logbase2(single_size) + 1
1212 };
1213
1214 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1215 sizeof(shared), 64);
1216 }
1217
1218 static mali_ptr
1219 panfrost_get_tex_desc(struct panfrost_batch *batch,
1220 enum pipe_shader_type st,
1221 struct panfrost_sampler_view *view)
1222 {
1223 if (!view)
1224 return (mali_ptr) 0;
1225
1226 struct pipe_sampler_view *pview = &view->base;
1227 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1228
1229 /* Add the BO to the job so it's retained until the job is done. */
1230
1231 panfrost_batch_add_bo(batch, rsrc->bo,
1232 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1233 panfrost_bo_access_for_stage(st));
1234
1235 panfrost_batch_add_bo(batch, view->bo,
1236 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1237 panfrost_bo_access_for_stage(st));
1238
1239 return view->bo->gpu;
1240 }
1241
1242 static void
1243 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1244 struct pipe_context *pctx)
1245 {
1246 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1247 if (view->texture_bo != rsrc->bo->gpu ||
1248 view->modifier != rsrc->modifier) {
1249 panfrost_bo_unreference(view->bo);
1250 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1251 }
1252 }
1253
1254 void
1255 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1256 enum pipe_shader_type stage,
1257 struct mali_vertex_tiler_postfix *postfix)
1258 {
1259 struct panfrost_context *ctx = batch->ctx;
1260 struct panfrost_device *device = pan_device(ctx->base.screen);
1261
1262 if (!ctx->sampler_view_count[stage])
1263 return;
1264
1265 if (device->quirks & IS_BIFROST) {
1266 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1267 MALI_BIFROST_TEXTURE_LENGTH *
1268 ctx->sampler_view_count[stage],
1269 MALI_BIFROST_TEXTURE_LENGTH);
1270
1271 struct mali_bifrost_texture_packed *out =
1272 (struct mali_bifrost_texture_packed *) T.cpu;
1273
1274 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1275 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1276 struct pipe_sampler_view *pview = &view->base;
1277 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1278
1279 panfrost_update_sampler_view(view, &ctx->base);
1280 out[i] = view->bifrost_descriptor;
1281
1282 /* Add the BOs to the job so they are retained until the job is done. */
1283
1284 panfrost_batch_add_bo(batch, rsrc->bo,
1285 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1286 panfrost_bo_access_for_stage(stage));
1287
1288 panfrost_batch_add_bo(batch, view->bo,
1289 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1290 panfrost_bo_access_for_stage(stage));
1291 }
1292
1293 postfix->textures = T.gpu;
1294 } else {
1295 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1296
1297 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1298 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1299
1300 panfrost_update_sampler_view(view, &ctx->base);
1301
1302 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1303 }
1304
1305 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1306 trampolines,
1307 sizeof(uint64_t) *
1308 ctx->sampler_view_count[stage],
1309 sizeof(uint64_t));
1310 }
1311 }
1312
1313 void
1314 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1315 enum pipe_shader_type stage,
1316 struct mali_vertex_tiler_postfix *postfix)
1317 {
1318 struct panfrost_context *ctx = batch->ctx;
1319
1320 if (!ctx->sampler_count[stage])
1321 return;
1322
1323 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1324 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1325
1326 size_t sz = desc_size * ctx->sampler_count[stage];
1327 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1328 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1329
1330 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1331 out[i] = ctx->samplers[stage][i]->hw;
1332
1333 postfix->sampler_descriptor = T.gpu;
1334 }
1335
1336 void
1337 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1338 struct mali_vertex_tiler_postfix *vertex_postfix)
1339 {
1340 struct panfrost_context *ctx = batch->ctx;
1341 struct panfrost_vertex_state *so = ctx->vertex;
1342 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1343
1344 unsigned instance_shift = vertex_postfix->instance_shift;
1345 unsigned instance_odd = vertex_postfix->instance_odd;
1346
1347 /* Worst case: everything is NPOT, which is only possible if instancing
1348  * is enabled. Otherwise a single record is guaranteed */
1349 bool could_npot = instance_shift || instance_odd;
1350
1351 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1352 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1353 (could_npot ? 2 : 1),
1354 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1355
1356 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1357 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1358 MALI_ATTRIBUTE_LENGTH);
1359
1360 struct mali_attribute_buffer_packed *bufs =
1361 (struct mali_attribute_buffer_packed *) S.cpu;
1362
1363 struct mali_attribute_packed *out =
1364 (struct mali_attribute_packed *) T.cpu;
1365
1366 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1367 unsigned k = 0;
1368
1369 for (unsigned i = 0; i < so->num_elements; ++i) {
1370 /* We map buffers 1:1 with the attributes, which
1371 * means duplicating some vertex buffers (who cares? aside from
1372 * maybe some caching implications but I somehow doubt that
1373 * matters) */
1374
1375 struct pipe_vertex_element *elem = &so->pipe[i];
1376 unsigned vbi = elem->vertex_buffer_index;
1377 attrib_to_buffer[i] = k;
1378
1379 if (!(ctx->vb_mask & (1 << vbi)))
1380 continue;
1381
1382 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1383 struct panfrost_resource *rsrc;
1384
1385 rsrc = pan_resource(buf->buffer.resource);
1386 if (!rsrc)
1387 continue;
1388
1389 /* Add a dependency of the batch on the vertex buffer */
1390 panfrost_batch_add_bo(batch, rsrc->bo,
1391 PAN_BO_ACCESS_SHARED |
1392 PAN_BO_ACCESS_READ |
1393 PAN_BO_ACCESS_VERTEX_TILER);
1394
1395 /* Mask off lower bits, see offset fixup below */
1396 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1397 mali_ptr addr = raw_addr & ~63;
1398
1399 /* Since we rounded the base pointer down, grow the size by the masked-off
1400  * bytes; buffer_offset is subtracted since the buffer starts there */
1401 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1402 - buf->buffer_offset;
1403
1404 /* When there is a divisor, the hardware-level divisor is
1405 * the product of the instance divisor and the padded count */
1406 unsigned divisor = elem->instance_divisor;
1407 unsigned hw_divisor = ctx->padded_count * divisor;
1408 unsigned stride = buf->stride;
1409
1410 /* If there's a divisor(=1) but no instancing, we want every
1411 * attribute to be the same */
1412
1413 if (divisor && ctx->instance_count == 1)
1414 stride = 0;
1415
1416 if (!divisor || ctx->instance_count <= 1) {
1417 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1418 if (ctx->instance_count > 1)
1419 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1420
1421 cfg.pointer = addr;
1422 cfg.stride = stride;
1423 cfg.size = size;
1424 cfg.divisor_r = instance_shift;
1425 cfg.divisor_p = instance_odd;
1426 }
1427 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1428 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1429 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1430 cfg.pointer = addr;
1431 cfg.stride = stride;
1432 cfg.size = size;
1433 cfg.divisor_r = __builtin_ctz(hw_divisor);
1434 }
1435
1436 } else {
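/* Non-power-of-two divisor: the descriptor carries a magic numerator,
 * a shift and an extra flag (plus the raw divisor in the continuation
 * record), in the spirit of division-by-invariant-integer reciprocals.
 * panfrost_compute_magic_divisor derives these from hw_divisor. */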
1437 unsigned shift = 0, extra_flags = 0;
1438
1439 unsigned magic_divisor =
1440 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1441
1442 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1443 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1444 cfg.pointer = addr;
1445 cfg.stride = stride;
1446 cfg.size = size;
1447
1448 cfg.divisor_r = shift;
1449 cfg.divisor_e = extra_flags;
1450 }
1451
1452 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1453 cfg.divisor_numerator = magic_divisor;
1454 cfg.divisor = divisor;
1455 }
1456
1457 ++k;
1458 }
1459
1460 ++k;
1461 }
1462
1463 /* Add special gl_VertexID/gl_InstanceID buffers */
1464
1465 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1466 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1467
1468 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1469 cfg.buffer_index = k++;
1470 cfg.format = so->formats[PAN_VERTEX_ID];
1471 }
1472
1473 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1474
1475 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1476 cfg.buffer_index = k++;
1477 cfg.format = so->formats[PAN_INSTANCE_ID];
1478 }
1479 }
1480
1481 /* Attribute addresses require 64-byte alignment, so let:
1482 *
1483 * base' = base & ~63 = base - (base & 63)
1484 * offset' = offset + (base & 63)
1485 *
1486 * Since base' + offset' = base + offset, these are equivalent
1487 * addressing modes and now base is 64 aligned.
1488 */
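/* Worked example with a made-up address: base = 0x10047 has
 * base & 63 = 0x7, so base' = 0x10040 and the element's src_offset
 * grows by 7. */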
1489
1490 unsigned start = vertex_postfix->offset_start;
1491
1492 for (unsigned i = 0; i < so->num_elements; ++i) {
1493 unsigned vbi = so->pipe[i].vertex_buffer_index;
1494 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1495
1496 /* Adjust by the masked off bits of the offset. Make sure we
1497 * read src_offset from so->hw (which is not GPU visible)
1498 * rather than target (which is) due to caching effects */
1499
1500 unsigned src_offset = so->pipe[i].src_offset;
1501
1502 /* BOs aligned to 4k so guaranteed aligned to 64 */
1503 src_offset += (buf->buffer_offset & 63);
1504
1505 /* Also, somewhat obscurely, per-instance data needs to be
1506 * offset in response to a delayed start in an indexed draw */
1507
1508 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1509 src_offset -= buf->stride * start;
1510
1511 pan_pack(out + i, ATTRIBUTE, cfg) {
1512 cfg.buffer_index = attrib_to_buffer[i];
1513 cfg.format = so->formats[i];
1514 cfg.offset = src_offset;
1515 }
1516 }
1517
1518 vertex_postfix->attributes = S.gpu;
1519 vertex_postfix->attribute_meta = T.gpu;
1520 }
1521
1522 static mali_ptr
1523 panfrost_emit_varyings(struct panfrost_batch *batch,
1524 struct mali_attribute_buffer_packed *slot,
1525 unsigned stride, unsigned count)
1526 {
1527 unsigned size = stride * count;
1528 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1529
1530 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1531 cfg.stride = stride;
1532 cfg.size = size;
1533 cfg.pointer = ptr;
1534 }
1535
1536 return ptr;
1537 }
1538
1539 static unsigned
1540 panfrost_streamout_offset(unsigned stride, unsigned offset,
1541 struct pipe_stream_output_target *target)
1542 {
1543 return (target->buffer_offset + (offset * stride * 4)) & 63;
1544 }
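/* The value returned above is the sub-64-byte residue of the streamout
 * address: panfrost_emit_streamout aligns the buffer pointer down to 64
 * bytes, and pan_emit_vary_xfb adds this residue back into the varying
 * record's offset (via the streamout_offsets array). */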
1545
1546 static void
1547 panfrost_emit_streamout(struct panfrost_batch *batch,
1548 struct mali_attribute_buffer_packed *slot,
1549 unsigned stride_words, unsigned offset, unsigned count,
1550 struct pipe_stream_output_target *target)
1551 {
1552 unsigned stride = stride_words * 4;
1553 unsigned max_size = target->buffer_size;
1554 unsigned expected_size = stride * count;
1555
1556 /* Grab the BO and bind it to the batch */
1557 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1558
1559 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1560 * the perspective of the TILER and FRAGMENT.
1561 */
1562 panfrost_batch_add_bo(batch, bo,
1563 PAN_BO_ACCESS_SHARED |
1564 PAN_BO_ACCESS_RW |
1565 PAN_BO_ACCESS_VERTEX_TILER |
1566 PAN_BO_ACCESS_FRAGMENT);
1567
1568 /* We will have an offset applied to get alignment */
1569 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1570
1571 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1572 cfg.pointer = (addr & ~63);
1573 cfg.stride = stride;
1574 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1575 }
1576 }
1577
1578 static bool
1579 has_point_coord(unsigned mask, gl_varying_slot loc)
1580 {
1581 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1582 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1583 else if (loc == VARYING_SLOT_PNTC)
1584 return (mask & (1 << 8));
1585 else
1586 return false;
1587 }
1588
1589 /* Helpers for manipulating stream out information so we can pack varyings
1590 * accordingly. Compute the src_offset for a given captured varying */
1591
1592 static struct pipe_stream_output *
1593 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1594 {
1595 for (unsigned i = 0; i < info->num_outputs; ++i) {
1596 if (info->output[i].register_index == loc)
1597 return &info->output[i];
1598 }
1599
1600 unreachable("Varying not captured");
1601 }
1602
1603 static unsigned
1604 pan_varying_size(enum mali_format fmt)
1605 {
1606 unsigned type = MALI_EXTRACT_TYPE(fmt);
1607 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1608 unsigned bits = MALI_EXTRACT_BITS(fmt);
1609 unsigned bpc = 0;
1610
1611 if (bits == MALI_CHANNEL_FLOAT) {
1612 /* No doubles */
1613 bool fp16 = (type == MALI_FORMAT_SINT);
1614 assert(fp16 || (type == MALI_FORMAT_UNORM));
1615
1616 bpc = fp16 ? 2 : 4;
1617 } else {
1618 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1619
1620 /* See the enums */
1621 bits = 1 << bits;
1622 assert(bits >= 8);
1623 bpc = bits / 8;
1624 }
1625
1626 return bpc * chan;
1627 }
1628
1629 /* Indices for named (non-XFB) varyings that are present. These are packed
1630 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1631 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1632 * of a given special field given a shift S by:
1633 *
1634 * idx = popcount(P & ((1 << S) - 1))
1635 *
1636  * That is, look at all of the varyings that come earlier and count them;
1637  * that count is the index of the new one. Likewise, the total number of special
1638 * buffers required is simply popcount(P)
1639 */
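/* Worked example: with P = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION)
 * | (1 << PAN_VARY_PSIZ) = 0b111, the PSIZ buffer (S = 2) lands at index
 * popcount(0b111 & 0b011) = 2, and popcount(P) = 3 buffers are needed. */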
1640
1641 enum pan_special_varying {
1642 PAN_VARY_GENERAL = 0,
1643 PAN_VARY_POSITION = 1,
1644 PAN_VARY_PSIZ = 2,
1645 PAN_VARY_PNTCOORD = 3,
1646 PAN_VARY_FACE = 4,
1647 PAN_VARY_FRAGCOORD = 5,
1648
1649 /* Keep last */
1650 PAN_VARY_MAX,
1651 };
1652
1653 /* Given a varying, figure out which index it corresponds to */
1654
1655 static inline unsigned
1656 pan_varying_index(unsigned present, enum pan_special_varying v)
1657 {
1658 unsigned mask = (1 << v) - 1;
1659 return util_bitcount(present & mask);
1660 }
1661
1662 /* Get the base offset for XFB buffers, which by convention come after
1663 * everything else. Wrapper function for semantic reasons; by construction this
1664 * is just popcount. */
1665
1666 static inline unsigned
1667 pan_xfb_base(unsigned present)
1668 {
1669 return util_bitcount(present);
1670 }
1671
1672 /* Computes the present mask for varyings so we can start emitting varying records */
1673
1674 static inline unsigned
1675 pan_varying_present(
1676 struct panfrost_shader_state *vs,
1677 struct panfrost_shader_state *fs,
1678 unsigned quirks)
1679 {
1680 /* At the moment we always emit general and position buffers. Not
1681 * strictly necessary but usually harmless */
1682
1683 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1684
1685 /* Enable special buffers by the shader info */
1686
1687 if (vs->writes_point_size)
1688 present |= (1 << PAN_VARY_PSIZ);
1689
1690 if (fs->reads_point_coord)
1691 present |= (1 << PAN_VARY_PNTCOORD);
1692
1693 if (fs->reads_face)
1694 present |= (1 << PAN_VARY_FACE);
1695
1696 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1697 present |= (1 << PAN_VARY_FRAGCOORD);
1698
1699 /* Also, if we have a point sprite, we need a point coord buffer */
1700
1701 for (unsigned i = 0; i < fs->varying_count; i++) {
1702 gl_varying_slot loc = fs->varyings_loc[i];
1703
1704 if (has_point_coord(fs->point_sprite_mask, loc))
1705 present |= (1 << PAN_VARY_PNTCOORD);
1706 }
1707
1708 return present;
1709 }
1710
1711 /* Emitters for varying records */
1712
1713 static void
1714 pan_emit_vary(struct mali_attribute_packed *out,
1715 unsigned present, enum pan_special_varying buf,
1716 unsigned quirks, enum mali_format format,
1717 unsigned offset)
1718 {
1719 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1720 unsigned swizzle = quirks & HAS_SWIZZLES ?
1721 panfrost_get_default_swizzle(nr_channels) :
1722 panfrost_bifrost_swizzle(nr_channels);
1723
1724 pan_pack(out, ATTRIBUTE, cfg) {
1725 cfg.buffer_index = pan_varying_index(present, buf);
1726 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1727 cfg.format = (format << 12) | swizzle;
1728 cfg.offset = offset;
1729 }
1730 }
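/* Note on the packing above: cfg.format carries the swizzle in its low 12
 * bits with the mali_format enum shifted above it, which is why the XFB
 * emitter below applies the same << 12 shift to its overridden format. */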
1731
1732 /* General varying that is unused: emit a discard record */
1733
1734 static void
1735 pan_emit_vary_only(struct mali_attribute_packed *out,
1736 unsigned present, unsigned quirks)
1737 {
1738 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1739 }
1740
1741 /* Special records */
1742
1743 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1744 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1745 [PAN_VARY_PSIZ] = MALI_R16F,
1746 [PAN_VARY_PNTCOORD] = MALI_R16F,
1747 [PAN_VARY_FACE] = MALI_R32I,
1748 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1749 };
1750
1751 static void
1752 pan_emit_vary_special(struct mali_attribute_packed *out,
1753 unsigned present, enum pan_special_varying buf,
1754 unsigned quirks)
1755 {
1756 assert(buf < PAN_VARY_MAX);
1757 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1758 }
1759
1760 static enum mali_format
1761 pan_xfb_format(enum mali_format format, unsigned nr)
1762 {
1763 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1764 return MALI_R32F | MALI_NR_CHANNELS(nr);
1765 else
1766 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1767 }
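/* For example, a half-float varying captured with 3 components widens to
 * MALI_R32F | MALI_NR_CHANNELS(3): XFB always stores 32 bits per channel,
 * which is why the capture offsets below are measured in dwords. */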
1768
1769 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1770 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1771 * value. */
1772
1773 static void
1774 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1775 unsigned present,
1776 unsigned max_xfb,
1777 unsigned *streamout_offsets,
1778 unsigned quirks,
1779 enum mali_format format,
1780 struct pipe_stream_output o)
1781 {
1782 unsigned swizzle = quirks & HAS_SWIZZLES ?
1783 panfrost_get_default_swizzle(o.num_components) :
1784 panfrost_bifrost_swizzle(o.num_components);
1785
1786 pan_pack(out, ATTRIBUTE, cfg) {
1787 /* XFB buffers come after everything else */
1788 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1789 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1790
1791 /* Override number of channels and precision to highp */
1792 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1793
1794 		/* Combine the capture's dword offset with the buffer's streamout offset */
1795 cfg.offset = (o.dst_offset * 4) /* dwords */
1796 + streamout_offsets[o.output_buffer];
1797 }
1798 }
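/* Worked example with made-up numbers: a capture with o.dst_offset = 2
 * (dwords) into output_buffer 1, where streamout_offsets[1] = 64, lands at
 * byte offset 2 * 4 + 64 = 72 within that streamout buffer. */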
1799
1800 /* Determine if we should capture a varying for XFB. This requires actually
1801  * having a buffer for it. If we don't capture it, we'll fall back to a general
1802 * varying path (linked or unlinked, possibly discarding the write) */
1803
1804 static bool
1805 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1806 unsigned loc, unsigned max_xfb)
1807 {
1808 if (!(xfb->so_mask & (1ll << loc)))
1809 return false;
1810
1811 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1812 return o->output_buffer < max_xfb;
1813 }
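/* E.g. a varying whose so_mask bit is set but whose pipe_stream_output
 * targets output_buffer 4 when only 4 streamout targets (indices 0-3) are
 * bound is not captured; it falls back to the general varying path. */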
1814
1815 static void
1816 pan_emit_general_varying(struct mali_attribute_packed *out,
1817 struct panfrost_shader_state *other,
1818 struct panfrost_shader_state *xfb,
1819 gl_varying_slot loc,
1820 enum mali_format format,
1821 unsigned present,
1822 unsigned quirks,
1823 unsigned *gen_offsets,
1824 enum mali_format *gen_formats,
1825 unsigned *gen_stride,
1826 unsigned idx,
1827 bool should_alloc)
1828 {
1829 /* Check if we're linked */
1830 signed other_idx = -1;
1831
1832 for (unsigned j = 0; j < other->varying_count; ++j) {
1833 if (other->varyings_loc[j] == loc) {
1834 other_idx = j;
1835 break;
1836 }
1837 }
1838
1839 if (other_idx < 0) {
1840 pan_emit_vary_only(out, present, quirks);
1841 return;
1842 }
1843
1844 unsigned offset = gen_offsets[other_idx];
1845
1846 if (should_alloc) {
1847 		/* We're linked, so allocate space via a watermark allocation */
1848 enum mali_format alt = other->varyings[other_idx];
1849
1850 /* Do interpolation at minimum precision */
1851 unsigned size_main = pan_varying_size(format);
1852 unsigned size_alt = pan_varying_size(alt);
1853 unsigned size = MIN2(size_main, size_alt);
1854
1855 /* If a varying is marked for XFB but not actually captured, we
1856 * should match the format to the format that would otherwise
1857 * be used for XFB, since dEQP checks for invariance here. It's
1858 * unclear if this is required by the spec. */
1859
1860 if (xfb->so_mask & (1ull << loc)) {
1861 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1862 format = pan_xfb_format(format, o->num_components);
1863 size = pan_varying_size(format);
1864 } else if (size == size_alt) {
1865 format = alt;
1866 }
1867
1868 gen_offsets[idx] = *gen_stride;
1869 gen_formats[other_idx] = format;
1870 offset = *gen_stride;
1871 *gen_stride += size;
1872 }
1873
1874 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1875 }
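/* Illustrative numbers: if one side of the link uses an fp32 vec4 (16 bytes)
 * and the other an fp16 vec4 (8 bytes), the watermark allocation reserves
 * MIN2(16, 8) = 8 bytes: the record's offset is the old *gen_stride, which
 * then advances by 8. Varyings marked for XFB instead take the 32-bit XFB
 * format and its size, for invariance with the captured data. */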
1876
1877 /* Higher-level wrapper around all of the above, classifying a varying into one
1878 * of the above types */
1879
1880 static void
1881 panfrost_emit_varying(
1882 struct mali_attribute_packed *out,
1883 struct panfrost_shader_state *stage,
1884 struct panfrost_shader_state *other,
1885 struct panfrost_shader_state *xfb,
1886 unsigned present,
1887 unsigned max_xfb,
1888 unsigned *streamout_offsets,
1889 unsigned quirks,
1890 unsigned *gen_offsets,
1891 enum mali_format *gen_formats,
1892 unsigned *gen_stride,
1893 unsigned idx,
1894 bool should_alloc,
1895 bool is_fragment)
1896 {
1897 gl_varying_slot loc = stage->varyings_loc[idx];
1898 enum mali_format format = stage->varyings[idx];
1899
1900 /* Override format to match linkage */
1901 if (!should_alloc && gen_formats[idx])
1902 format = gen_formats[idx];
1903
1904 if (has_point_coord(stage->point_sprite_mask, loc)) {
1905 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1906 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1907 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1908 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1909 } else if (loc == VARYING_SLOT_POS) {
1910 if (is_fragment)
1911 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1912 else
1913 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1914 } else if (loc == VARYING_SLOT_PSIZ) {
1915 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1916 } else if (loc == VARYING_SLOT_PNTC) {
1917 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1918 } else if (loc == VARYING_SLOT_FACE) {
1919 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1920 } else {
1921 pan_emit_general_varying(out, other, xfb, loc, format, present,
1922 quirks, gen_offsets, gen_formats, gen_stride,
1923 idx, should_alloc);
1924 }
1925 }
1926
1927 static void
1928 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1929 unsigned present,
1930 enum pan_special_varying v,
1931 unsigned special)
1932 {
1933 if (present & (1 << v)) {
1934 unsigned idx = pan_varying_index(present, v);
1935
1936 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1937 cfg.special = special;
1938 cfg.type = 0;
1939 }
1940 }
1941 }
1942
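/* The attribute buffer array built below is laid out as:
 *
 *    [0, popcount(present))              special/general buffers, ordered
 *                                        by pan_varying_index()
 *    [xfb_base, xfb_base + num_targets)  one buffer per streamout target
 *
 * where xfb_base = pan_xfb_base(present). */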
1943 void
1944 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1945 unsigned vertex_count,
1946 struct mali_vertex_tiler_postfix *vertex_postfix,
1947 struct mali_vertex_tiler_postfix *tiler_postfix,
1948 union midgard_primitive_size *primitive_size)
1949 {
1950 /* Load the shaders */
1951 struct panfrost_context *ctx = batch->ctx;
1952 struct panfrost_device *dev = pan_device(ctx->base.screen);
1953 struct panfrost_shader_state *vs, *fs;
1954 size_t vs_size, fs_size;
1955
1956 /* Allocate the varying descriptor */
1957
1958 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1959 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1960 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1961 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1962
1963 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1964 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1965
1966 struct pipe_stream_output_info *so = &vs->stream_output;
1967 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1968
1969 /* Check if this varying is linked by us. This is the case for
1970 * general-purpose, non-captured varyings. If it is, link it. If it's
1971 * not, use the provided stream out information to determine the
1972 * offset, since it was already linked for us. */
1973
1974 unsigned gen_offsets[32];
1975 enum mali_format gen_formats[32];
1976 memset(gen_offsets, 0, sizeof(gen_offsets));
1977 memset(gen_formats, 0, sizeof(gen_formats));
1978
1979 unsigned gen_stride = 0;
1980 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1981 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1982
1983 unsigned streamout_offsets[32];
1984
1985 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1986 streamout_offsets[i] = panfrost_streamout_offset(
1987 so->stride[i],
1988 ctx->streamout.offsets[i],
1989 ctx->streamout.targets[i]);
1990 }
1991
1992 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1993 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1994
1995 for (unsigned i = 0; i < vs->varying_count; i++) {
1996 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1997 ctx->streamout.num_targets, streamout_offsets,
1998 dev->quirks,
1999 gen_offsets, gen_formats, &gen_stride, i, true, false);
2000 }
2001
2002 for (unsigned i = 0; i < fs->varying_count; i++) {
2003 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
2004 ctx->streamout.num_targets, streamout_offsets,
2005 dev->quirks,
2006 gen_offsets, gen_formats, &gen_stride, i, false, true);
2007 }
2008
2009 unsigned xfb_base = pan_xfb_base(present);
2010 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
2011 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
2012 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
2013 struct mali_attribute_buffer_packed *varyings =
2014 (struct mali_attribute_buffer_packed *) T.cpu;
2015
2016 /* Emit the stream out buffers */
2017
2018 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2019 ctx->vertex_count);
2020
2021 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2022 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2023 so->stride[i],
2024 ctx->streamout.offsets[i],
2025 out_count,
2026 ctx->streamout.targets[i]);
2027 }
2028
2029 panfrost_emit_varyings(batch,
2030 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2031 gen_stride, vertex_count);
2032
2033 /* fp32 vec4 gl_Position */
2034 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2035 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2036 sizeof(float) * 4, vertex_count);
2037
2038 if (present & (1 << PAN_VARY_PSIZ)) {
2039 primitive_size->pointer = panfrost_emit_varyings(batch,
2040 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2041 2, vertex_count);
2042 }
2043
2044 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2045 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2046 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2047
2048 vertex_postfix->varyings = T.gpu;
2049 tiler_postfix->varyings = T.gpu;
2050
2051 vertex_postfix->varying_meta = trans.gpu;
2052 tiler_postfix->varying_meta = trans.gpu + vs_size;
2053 }
2054
2055 void
2056 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2057 struct mali_vertex_tiler_prefix *vertex_prefix,
2058 struct mali_vertex_tiler_postfix *vertex_postfix,
2059 struct mali_vertex_tiler_prefix *tiler_prefix,
2060 struct mali_vertex_tiler_postfix *tiler_postfix,
2061 union midgard_primitive_size *primitive_size)
2062 {
2063 struct panfrost_context *ctx = batch->ctx;
2064 struct panfrost_device *device = pan_device(ctx->base.screen);
2065 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2066 struct bifrost_payload_vertex bifrost_vertex = {0,};
2067 struct bifrost_payload_tiler bifrost_tiler = {0,};
2068 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2069 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2070 void *vp, *tp;
2071 size_t vp_size, tp_size;
2072
2073 if (device->quirks & IS_BIFROST) {
2074 bifrost_vertex.prefix = *vertex_prefix;
2075 bifrost_vertex.postfix = *vertex_postfix;
2076 vp = &bifrost_vertex;
2077 vp_size = sizeof(bifrost_vertex);
2078
2079 bifrost_tiler.prefix = *tiler_prefix;
2080 bifrost_tiler.tiler.primitive_size = *primitive_size;
2081 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2082 bifrost_tiler.postfix = *tiler_postfix;
2083 tp = &bifrost_tiler;
2084 tp_size = sizeof(bifrost_tiler);
2085 } else {
2086 midgard_vertex.prefix = *vertex_prefix;
2087 midgard_vertex.postfix = *vertex_postfix;
2088 vp = &midgard_vertex;
2089 vp_size = sizeof(midgard_vertex);
2090
2091 midgard_tiler.prefix = *tiler_prefix;
2092 midgard_tiler.postfix = *tiler_postfix;
2093 midgard_tiler.primitive_size = *primitive_size;
2094 tp = &midgard_tiler;
2095 tp_size = sizeof(midgard_tiler);
2096 }
2097
2098 if (wallpapering) {
2099 /* Inject in reverse order, with "predicted" job indices.
2100 * THIS IS A HACK XXX */
2101 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2102 batch->scoreboard.job_index + 2, tp, tp_size, true);
2103 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2104 vp, vp_size, true);
2105 return;
2106 }
2107
2108 	/* If rasterizer discard is enabled, only submit the vertex job */
2109
2110 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2111 vp, vp_size, false);
2112
2113 if (ctx->rasterizer->base.rasterizer_discard)
2114 return;
2115
2116 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2117 false);
2118 }
2119
2120 /* TODO: stop hardcoding this */
2121 mali_ptr
2122 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2123 {
2124 uint16_t locations[] = {
2125 128, 128,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 0, 256,
2130 0, 256,
2131 0, 256,
2132 0, 256,
2133 0, 256,
2134 0, 256,
2135 0, 256,
2136 0, 256,
2137 0, 256,
2138 0, 256,
2139 0, 256,
2140 0, 256,
2141 0, 256,
2142 0, 256,
2143 0, 256,
2144 0, 256,
2145 0, 256,
2146 0, 256,
2147 0, 256,
2148 0, 256,
2149 0, 256,
2150 0, 256,
2151 0, 256,
2152 0, 256,
2153 0, 256,
2154 0, 256,
2155 0, 256,
2156 0, 256,
2157 128, 128,
2158 0, 0,
2159 0, 0,
2160 0, 0,
2161 0, 0,
2162 0, 0,
2163 0, 0,
2164 0, 0,
2165 0, 0,
2166 0, 0,
2167 0, 0,
2168 0, 0,
2169 0, 0,
2170 0, 0,
2171 0, 0,
2172 0, 0,
2173 };
2174
2175 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2176 }