panfrost: XMLify beginning of shader descriptor
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), could last longer. Also get
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
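/* (Downstream, panfrost_vt_set_draw_info() uses max_index - min_index + 1 as
 * the effective vertex count and rebases the draw on min_index, which is why
 * the bounds and the index upload want to be computed in one place.) */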
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
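/* Sketch of the encoding used below (not taken from hardware documentation):
 * the padded count is stored as a (shift, odd) pair with
 * padded = ((odd << 1) | 1) << shift. E.g. a padded count of 20 = 5 << 2
 * gives shift = __builtin_ctz(20) = 2 and odd = 20 >> (2 + 1) = 2, and
 * ((2 << 1) | 1) << 2 round-trips back to 20. */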
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static void
310 panfrost_emit_compute_shader(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 memcpy(&meta->shader, &ss->shader, sizeof(ss->shader));
319
320 if (dev->quirks & IS_BIFROST) {
321 struct mali_bifrost_properties_packed prop;
322 struct mali_preload_vertex_packed preload;
323
324 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
325 cfg.unknown = 0x800000; /* XXX */
326 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, st);
327 }
328
329 /* TODO: True compute shaders */
330 pan_pack(&preload, PRELOAD_VERTEX, cfg) {
331 cfg.uniform_count = ss->uniform_count;
332 cfg.vertex_id = true;
333 cfg.instance_id = true;
334 }
335
336 memcpy(&meta->bifrost_props, &prop, sizeof(prop));
337 memcpy(&meta->bifrost_preload, &preload, sizeof(preload));
338 } else {
339 struct mali_midgard_properties_packed prop;
340
341 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
342 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, st);
343 cfg.uniform_count = ss->uniform_count;
344 cfg.work_register_count = ss->work_reg_count;
345 cfg.writes_globals = ss->writes_global;
346 cfg.suppress_inf_nan = true; /* XXX */
347 }
348
349 memcpy(&meta->midgard_props, &prop, sizeof(prop));
350 }
351 }
352
353 static unsigned
354 translate_tex_wrap(enum pipe_tex_wrap w)
355 {
356 switch (w) {
357 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
358 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
359 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
360 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
361 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
362 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
363 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
364 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
365 default: unreachable("Invalid wrap");
366 }
367 }
368
369 /* The hardware compares in the wrong order, so we have to flip before
370 * encoding. Yes, really. */
371
372 static enum mali_func
373 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
374 {
375 if (!cso->compare_mode)
376 return MALI_FUNC_NEVER;
377
378 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
379 return panfrost_flip_compare_func(f);
380 }
381
382 static enum mali_mipmap_mode
383 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
384 {
385 switch (f) {
386 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
387 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
388 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
389 default: unreachable("Invalid");
390 }
391 }
392
393 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
394 struct mali_midgard_sampler_packed *hw)
395 {
396 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
397 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
398 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
399 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
400 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
401 cfg.normalized_coordinates = cso->normalized_coords;
402
403 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
404
405 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
406
407 /* If necessary, we disable mipmapping in the sampler descriptor by
408 * clamping the LOD as tight as possible (from 0 to epsilon,
409 * essentially -- remember these are fixed point numbers, so
410 * epsilon=1/256) */
411
412 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
413 cfg.minimum_lod + 1 :
414 FIXED_16(cso->max_lod, false);
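/* Concretely (assuming the 8 fractional bits implied by epsilon = 1/256):
 * with min_lod = 0 the clamp becomes [0, 1/256], so only level 0 is ever
 * sampled even though mipmap_mode above still reads MALI_MIPMAP_MODE_NEAREST. */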
415
416 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
417 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
418 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
419
420 cfg.compare_function = panfrost_sampler_compare_func(cso);
421 cfg.seamless_cube_map = cso->seamless_cube_map;
422
423 cfg.border_color_r = cso->border_color.f[0];
424 cfg.border_color_g = cso->border_color.f[1];
425 cfg.border_color_b = cso->border_color.f[2];
426 cfg.border_color_a = cso->border_color.f[3];
427 }
428 }
429
430 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
431 struct mali_bifrost_sampler_packed *hw)
432 {
433 pan_pack(hw, BIFROST_SAMPLER, cfg) {
434 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
435 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
436 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
437 cfg.normalized_coordinates = cso->normalized_coords;
438
439 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
440 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
441 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
442
443 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
444 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
445 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
446
447 cfg.compare_function = panfrost_sampler_compare_func(cso);
448 cfg.seamless_cube_map = cso->seamless_cube_map;
449 }
450 }
451
452 static bool
453 panfrost_fs_required(
454 struct panfrost_shader_state *fs,
455 struct panfrost_blend_final *blend,
456 unsigned rt_count)
457 {
458 /* If we generally have side effects */
459 if (fs->fs_sidefx)
460 return true;
461
462 /* If colour is written we need to execute */
463 for (unsigned i = 0; i < rt_count; ++i) {
464 if (!blend[i].no_colour)
465 return true;
466 }
467
468 /* If depth is written and not implied we need to execute.
469 * TODO: Predicate on Z/S writes being enabled */
470 return (fs->writes_depth || fs->writes_stencil);
471 }
472
473 static void
474 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
475 struct panfrost_blend_final *blend)
476 {
477 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
478 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
479 unsigned rt_count = batch->key.nr_cbufs;
480
481 struct bifrost_blend_rt *brts = rts;
482 struct midgard_blend_rt *mrts = rts;
483
484 /* Disable blending for depth-only on Bifrost */
485
486 if (rt_count == 0 && dev->quirks & IS_BIFROST)
487 brts[0].unk2 = 0x3;
488
489 for (unsigned i = 0; i < rt_count; ++i) {
490 unsigned flags = 0;
491
492 pan_pack(&flags, BLEND_FLAGS, cfg) {
493 if (blend[i].no_colour) {
494 cfg.enable = false;
495 break;
496 }
497
498 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
499
500 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
501 cfg.load_destination = blend[i].load_dest;
502 cfg.dither_disable = !batch->ctx->blend->base.dither;
503
504 if (!(dev->quirks & IS_BIFROST))
505 cfg.midgard_blend_shader = blend[i].is_shader;
506 }
507
508 if (dev->quirks & IS_BIFROST) {
509 brts[i].flags = flags;
510
511 if (blend[i].is_shader) {
512 /* The blend shader's address needs to be at
513 * the same top 32 bit as the fragment shader.
514 * TODO: Ensure that's always the case.
515 */
516 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
517 (fs->bo->gpu & (0xffffffffull << 32)));
518 brts[i].shader = blend[i].shader.gpu;
519 brts[i].unk2 = 0x0;
520 } else {
521 enum pipe_format format = batch->key.cbufs[i]->format;
522 const struct util_format_description *format_desc;
523 format_desc = util_format_description(format);
524
525 brts[i].equation = blend[i].equation.equation;
526
527 /* TODO: this is a bit more complicated */
528 brts[i].constant = blend[i].equation.constant;
529
530 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
531
532 /* 0x19 disables blending and forces REPLACE
533 * mode (equivalent to rgb_mode = alpha_mode =
534 * x122, colour mask = 0xF). 0x1a allows
535 * blending. */
536 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
537
538 brts[i].shader_type = fs->blend_types[i];
539 }
540 } else {
541 memcpy(&mrts[i].flags, &flags, sizeof(flags));
542
543 if (blend[i].is_shader) {
544 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
545 } else {
546 mrts[i].blend.equation = blend[i].equation.equation;
547 mrts[i].blend.constant = blend[i].equation.constant;
548 }
549 }
550 }
551 }
552
553 static struct mali_shader_packed
554 panfrost_pack_shaderless(bool midgard)
555 {
556 struct mali_shader_packed pack;
557
558 pan_pack(&pack, SHADER, cfg) {
559 cfg.shader = midgard ? 0x1 : 0x0;
560 }
561
562 return pack;
563 }
564
565 static void
566 panfrost_emit_frag_shader(struct panfrost_context *ctx,
567 struct mali_shader_meta *fragmeta,
568 struct panfrost_blend_final *blend)
569 {
570 const struct panfrost_device *dev = pan_device(ctx->base.screen);
571 struct panfrost_shader_state *fs;
572
573 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
574
575 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
576 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
577 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
578
579 memset(fragmeta, 0, sizeof(*fragmeta));
580 memcpy(&fragmeta->shader, &fs->shader, sizeof(fs->shader));
581
582 if (dev->quirks & IS_BIFROST) {
583 struct mali_bifrost_properties_packed prop;
584 struct mali_preload_fragment_packed preload;
585
586 bool no_blend = true;
587
588 for (unsigned i = 0; i < rt_count; ++i)
589 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
590
591 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
592 cfg.unknown = 0x950020; /* XXX */
593 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
594 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
595 }
596
597 pan_pack(&preload, PRELOAD_FRAGMENT, cfg) {
598 cfg.uniform_count = fs->uniform_count;
599 cfg.fragment_position = fs->reads_frag_coord;
600 }
601
602 memcpy(&fragmeta->bifrost_props, &prop, sizeof(prop));
603 memcpy(&fragmeta->bifrost_preload, &preload, sizeof(preload));
604 } else {
605 struct mali_midgard_properties_packed prop;
606
607 /* Reasons to disable early-Z from a shader perspective */
608 bool late_z = fs->can_discard || fs->writes_global ||
609 fs->writes_depth || fs->writes_stencil;
610
611 /* Reasons to disable early-Z from a CSO perspective */
612 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
613
614 /* If either depth or stencil is enabled, discard matters */
615 bool zs_enabled =
616 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
617 zsa->base.stencil[0].enabled;
618
619 bool has_blend_shader = false;
620
621 for (unsigned c = 0; c < rt_count; ++c)
622 has_blend_shader |= blend[c].is_shader;
623
624 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
625 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
626 cfg.uniform_count = fs->uniform_count;
627 cfg.work_register_count = fs->work_reg_count;
628 cfg.writes_globals = fs->writes_global;
629 cfg.suppress_inf_nan = true; /* XXX */
630
631 /* TODO: Reduce this limit? */
632 if (has_blend_shader)
633 cfg.work_register_count = MAX2(cfg.work_register_count, 8);
634
635 cfg.stencil_from_shader = fs->writes_stencil;
636 cfg.helper_invocation_enable = fs->helper_invocations;
637 cfg.depth_source = fs->writes_depth ?
638 MALI_DEPTH_SOURCE_SHADER :
639 MALI_DEPTH_SOURCE_FIXED_FUNCTION;
640
641 /* Depend on other state */
642 cfg.early_z_enable = !(late_z || alpha_to_coverage);
643 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
644 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
645 }
646
647 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
648 }
649
650 bool msaa = rast->multisample;
651 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
652
653 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
654 fragmeta->unknown2_4 = 0x4e0;
655
656 /* TODO: Sample size */
657 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
658 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
659
660 /* EXT_shader_framebuffer_fetch requires the shader to be run
661 * per-sample when outputs are read. */
662 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
663 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
664
665 fragmeta->depth_units = rast->offset_units * 2.0f;
666 fragmeta->depth_factor = rast->offset_scale;
667
668 /* XXX: Which bit is which? Does this maybe allow offsetting non-triangles? */
669
670 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
671 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
672
673 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
674 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
675
676 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
677 zsa->base.stencil[0].enabled);
678
679 fragmeta->stencil_mask_front = zsa->stencil_mask_front;
680 fragmeta->stencil_mask_back = zsa->stencil_mask_back;
681
682 /* Bottom bits for stencil ref, exactly one word */
683 fragmeta->stencil_front.opaque[0] = zsa->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
684
685 /* If back-stencil is not enabled, use the front values */
686
687 if (zsa->base.stencil[1].enabled)
688 fragmeta->stencil_back.opaque[0] = zsa->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
689 else
690 fragmeta->stencil_back = fragmeta->stencil_front;
691
692 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
693 zsa->base.depth.writemask);
694
695 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
696 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
697 zsa->base.depth.enabled ? zsa->base.depth.func : PIPE_FUNC_ALWAYS));
698
699 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
700 ctx->blend->base.alpha_to_coverage);
701
702 /* Disable shader execution if we can */
703 if (!panfrost_fs_required(fs, blend, rt_count)) {
704 struct mali_shader_packed shader =
705 panfrost_pack_shaderless(!(dev->quirks & IS_BIFROST));
706
707 memcpy(&fragmeta->shader, &shader, sizeof(shader));
708
709 struct mali_midgard_properties_packed prop;
710
711 if (dev->quirks & IS_BIFROST) {
712 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
713 cfg.unknown = 0x950020; /* XXX */
714 cfg.early_z_enable = true;
715 }
716 } else {
717 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
718 cfg.work_register_count = 1;
719 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
720 cfg.early_z_enable = true;
721 }
722 }
723
724 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
725 }
726
727 if (dev->quirks & MIDGARD_SFBD) {
728 /* On single render target (SFBD) platforms, the blend
729 * information is inside the shader meta itself. We additionally
730 * need to signal CAN_DISCARD for nontrivial blend modes (so
731 * we're able to read back the destination buffer) */
732
733 if (blend[0].no_colour)
734 return;
735
736 fragmeta->unknown2_4 |= MALI_SFBD_ENABLE;
737
738 SET_BIT(fragmeta->unknown2_4, MALI_SFBD_SRGB,
739 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format));
740
741 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
742 blend[0].is_shader);
743
744 if (blend[0].is_shader) {
745 fragmeta->blend.shader = blend[0].shader.gpu |
746 blend[0].shader.first_tag;
747 } else {
748 fragmeta->blend.equation = blend[0].equation.equation;
749 fragmeta->blend.constant = blend[0].equation.constant;
750 }
751
752 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
753 blend[0].load_dest);
754
755 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER, !ctx->blend->base.dither);
756 } else if (!(dev->quirks & IS_BIFROST)) {
757 /* Bug where MRT-capable hw apparently reads the last blend
758 * shader from here instead of the usual location? */
759
760 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
761 if (!blend[rt].is_shader)
762 continue;
763
764 fragmeta->blend.shader = blend[rt].shader.gpu |
765 blend[rt].shader.first_tag;
766 break;
767 }
768 }
769 }
770
771 void
772 panfrost_emit_shader_meta(struct panfrost_batch *batch,
773 enum pipe_shader_type st,
774 struct mali_vertex_tiler_postfix *postfix)
775 {
776 struct panfrost_context *ctx = batch->ctx;
777 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
778
779 if (!ss) {
780 postfix->shader = 0;
781 return;
782 }
783
784 struct mali_shader_meta meta;
785
786 /* Add the shader BO to the batch. */
787 panfrost_batch_add_bo(batch, ss->bo,
788 PAN_BO_ACCESS_PRIVATE |
789 PAN_BO_ACCESS_READ |
790 panfrost_bo_access_for_stage(st));
791
792 mali_ptr shader_ptr;
793
794 if (st == PIPE_SHADER_FRAGMENT) {
795 struct panfrost_device *dev = pan_device(ctx->base.screen);
796 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
797 size_t desc_size = sizeof(meta);
798 void *rts = NULL;
799 struct panfrost_transfer xfer;
800 unsigned rt_size;
801
802 if (dev->quirks & MIDGARD_SFBD)
803 rt_size = 0;
804 else if (dev->quirks & IS_BIFROST)
805 rt_size = sizeof(struct bifrost_blend_rt);
806 else
807 rt_size = sizeof(struct midgard_blend_rt);
808
809 desc_size += rt_size * rt_count;
810
811 if (rt_size)
812 rts = rzalloc_size(ctx, rt_size * rt_count);
813
814 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
815
816 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
817 blend[c] = panfrost_get_blend_for_context(ctx, c);
818
819 panfrost_emit_frag_shader(ctx, &meta, blend);
820
821 if (!(dev->quirks & MIDGARD_SFBD))
822 panfrost_emit_blend(batch, rts, blend);
823 else
824 batch->draws |= PIPE_CLEAR_COLOR0;
825
826 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
827
828 memcpy(xfer.cpu, &meta, sizeof(meta));
829 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
830
831 if (rt_size)
832 ralloc_free(rts);
833
834 shader_ptr = xfer.gpu;
835 } else {
836 panfrost_emit_compute_shader(ctx, st, &meta);
837
838 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
839 sizeof(meta));
840 }
841
842 postfix->shader = shader_ptr;
843 }
844
845 void
846 panfrost_emit_viewport(struct panfrost_batch *batch,
847 struct mali_vertex_tiler_postfix *tiler_postfix)
848 {
849 struct panfrost_context *ctx = batch->ctx;
850 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
851 const struct pipe_scissor_state *ss = &ctx->scissor;
852 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
853 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
854
855 /* Derive min/max from translate/scale. Note since |x| >= 0 by
856 * definition, we have that -|x| <= |x| hence translate - |scale| <=
857 * translate + |scale|, so the ordering is correct here. */
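/* E.g. (numbers purely illustrative) a standard 1920x1080 viewport has
 * scale = (960, 540, ...) and translate = (960, 540, ...), giving
 * vp_minx = 0, vp_maxx = 1920, vp_miny = 0, vp_maxy = 1080. */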
858 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
859 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
860 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
861 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
862 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
863 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
864
865 /* Scissor to the intersection of the viewport and the scissor, clamped
866 * to the framebuffer */
867
868 unsigned minx = MIN2(fb->width, vp_minx);
869 unsigned maxx = MIN2(fb->width, vp_maxx);
870 unsigned miny = MIN2(fb->height, vp_miny);
871 unsigned maxy = MIN2(fb->height, vp_maxy);
872
873 if (ss && rast->scissor) {
874 minx = MAX2(ss->minx, minx);
875 miny = MAX2(ss->miny, miny);
876 maxx = MIN2(ss->maxx, maxx);
877 maxy = MIN2(ss->maxy, maxy);
878 }
879
880 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
881
882 pan_pack(T.cpu, VIEWPORT, cfg) {
883 cfg.scissor_minimum_x = minx;
884 cfg.scissor_minimum_y = miny;
885 cfg.scissor_maximum_x = maxx - 1;
886 cfg.scissor_maximum_y = maxy - 1;
887
888 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
889 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
890 }
891
892 tiler_postfix->viewport = T.gpu;
893 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
894 }
895
896 static mali_ptr
897 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
898 enum pipe_shader_type st,
899 struct panfrost_constant_buffer *buf,
900 unsigned index)
901 {
902 struct pipe_constant_buffer *cb = &buf->cb[index];
903 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
904
905 if (rsrc) {
906 panfrost_batch_add_bo(batch, rsrc->bo,
907 PAN_BO_ACCESS_SHARED |
908 PAN_BO_ACCESS_READ |
909 panfrost_bo_access_for_stage(st));
910
911 /* Alignment guaranteed by
912 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
913 return rsrc->bo->gpu + cb->buffer_offset;
914 } else if (cb->user_buffer) {
915 return panfrost_pool_upload_aligned(&batch->pool,
916 cb->user_buffer +
917 cb->buffer_offset,
918 cb->buffer_size, 16);
919 } else {
920 unreachable("No constant buffer");
921 }
922 }
923
924 struct sysval_uniform {
925 union {
926 float f[4];
927 int32_t i[4];
928 uint32_t u[4];
929 uint64_t du[2];
930 };
931 };
932
933 static void
934 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
935 struct sysval_uniform *uniform)
936 {
937 struct panfrost_context *ctx = batch->ctx;
938 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
939
940 uniform->f[0] = vp->scale[0];
941 uniform->f[1] = vp->scale[1];
942 uniform->f[2] = vp->scale[2];
943 }
944
945 static void
946 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
947 struct sysval_uniform *uniform)
948 {
949 struct panfrost_context *ctx = batch->ctx;
950 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
951
952 uniform->f[0] = vp->translate[0];
953 uniform->f[1] = vp->translate[1];
954 uniform->f[2] = vp->translate[2];
955 }
956
957 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
958 enum pipe_shader_type st,
959 unsigned int sysvalid,
960 struct sysval_uniform *uniform)
961 {
962 struct panfrost_context *ctx = batch->ctx;
963 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
964 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
965 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
966 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
967
968 assert(dim);
969 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
970
971 if (dim > 1)
972 uniform->i[1] = u_minify(tex->texture->height0,
973 tex->u.tex.first_level);
974
975 if (dim > 2)
976 uniform->i[2] = u_minify(tex->texture->depth0,
977 tex->u.tex.first_level);
978
979 if (is_array)
980 uniform->i[dim] = tex->texture->array_size;
981 }
982
983 static void
984 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
985 enum pipe_shader_type st,
986 unsigned ssbo_id,
987 struct sysval_uniform *uniform)
988 {
989 struct panfrost_context *ctx = batch->ctx;
990
991 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
992 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
993
994 /* Compute address */
995 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
996
997 panfrost_batch_add_bo(batch, bo,
998 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
999 panfrost_bo_access_for_stage(st));
1000
1001 /* Upload address and size as sysval */
1002 uniform->du[0] = bo->gpu + sb.buffer_offset;
1003 uniform->u[2] = sb.buffer_size;
1004 }
1005
1006 static void
1007 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1008 enum pipe_shader_type st,
1009 unsigned samp_idx,
1010 struct sysval_uniform *uniform)
1011 {
1012 struct panfrost_context *ctx = batch->ctx;
1013 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1014
1015 uniform->f[0] = sampl->min_lod;
1016 uniform->f[1] = sampl->max_lod;
1017 uniform->f[2] = sampl->lod_bias;
1018
1019 /* Even without any errata, Midgard represents "no mipmapping" as
1020 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1021 * panfrost_create_sampler_state which also explains our choice of
1022 * epsilon value (again to keep behaviour consistent) */
1023
1024 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1025 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1026 }
1027
1028 static void
1029 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1030 struct sysval_uniform *uniform)
1031 {
1032 struct panfrost_context *ctx = batch->ctx;
1033
1034 uniform->u[0] = ctx->compute_grid->grid[0];
1035 uniform->u[1] = ctx->compute_grid->grid[1];
1036 uniform->u[2] = ctx->compute_grid->grid[2];
1037 }
1038
1039 static void
1040 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1041 struct panfrost_shader_state *ss,
1042 enum pipe_shader_type st)
1043 {
1044 struct sysval_uniform *uniforms = (void *)buf;
1045
1046 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1047 int sysval = ss->sysval[i];
1048
1049 switch (PAN_SYSVAL_TYPE(sysval)) {
1050 case PAN_SYSVAL_VIEWPORT_SCALE:
1051 panfrost_upload_viewport_scale_sysval(batch,
1052 &uniforms[i]);
1053 break;
1054 case PAN_SYSVAL_VIEWPORT_OFFSET:
1055 panfrost_upload_viewport_offset_sysval(batch,
1056 &uniforms[i]);
1057 break;
1058 case PAN_SYSVAL_TEXTURE_SIZE:
1059 panfrost_upload_txs_sysval(batch, st,
1060 PAN_SYSVAL_ID(sysval),
1061 &uniforms[i]);
1062 break;
1063 case PAN_SYSVAL_SSBO:
1064 panfrost_upload_ssbo_sysval(batch, st,
1065 PAN_SYSVAL_ID(sysval),
1066 &uniforms[i]);
1067 break;
1068 case PAN_SYSVAL_NUM_WORK_GROUPS:
1069 panfrost_upload_num_work_groups_sysval(batch,
1070 &uniforms[i]);
1071 break;
1072 case PAN_SYSVAL_SAMPLER:
1073 panfrost_upload_sampler_sysval(batch, st,
1074 PAN_SYSVAL_ID(sysval),
1075 &uniforms[i]);
1076 break;
1077 default:
1078 assert(0);
1079 }
1080 }
1081 }
1082
1083 static const void *
1084 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1085 unsigned index)
1086 {
1087 struct pipe_constant_buffer *cb = &buf->cb[index];
1088 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1089
1090 if (rsrc)
1091 return rsrc->bo->cpu;
1092 else if (cb->user_buffer)
1093 return cb->user_buffer;
1094 else
1095 unreachable("No constant buffer");
1096 }
1097
1098 void
1099 panfrost_emit_const_buf(struct panfrost_batch *batch,
1100 enum pipe_shader_type stage,
1101 struct mali_vertex_tiler_postfix *postfix)
1102 {
1103 struct panfrost_context *ctx = batch->ctx;
1104 struct panfrost_shader_variants *all = ctx->shader[stage];
1105
1106 if (!all)
1107 return;
1108
1109 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1110
1111 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1112
1113 /* Uniforms are implicitly UBO #0 */
1114 bool has_uniforms = buf->enabled_mask & (1 << 0);
1115
1116 /* Allocate room for the sysval and the uniforms */
1117 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1118 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1119 size_t size = sys_size + uniform_size;
1120 struct panfrost_transfer transfer =
1121 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
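/* Layout of this combined allocation, as built below: one 16-byte (vec4)
 * slot per requested sysval, immediately followed by the raw contents of
 * gallium constant buffer 0; the whole range is then re-exposed to the
 * shader as UBO #0. */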
1122
1123 /* Upload sysvals requested by the shader */
1124 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1125
1126 /* Upload uniforms */
1127 if (has_uniforms && uniform_size) {
1128 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1129 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1130 }
1131
1132 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1133 * uploaded */
1134
1135 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1136 assert(ubo_count >= 1);
1137
1138 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1139 struct panfrost_transfer ubos =
1140 panfrost_pool_alloc_aligned(&batch->pool, sz,
1141 MALI_UNIFORM_BUFFER_LENGTH);
1142
1143 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1144
1145 /* Upload uniforms as a UBO */
1146
1147 if (size) {
1148 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1149 cfg.entries = DIV_ROUND_UP(size, 16);
1150 cfg.pointer = transfer.gpu;
1151 }
1152 } else {
1153 *ubo_ptr = 0;
1154 }
1155
1156 /* The rest are honest-to-goodness UBOs */
1157
1158 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1159 size_t usz = buf->cb[ubo].buffer_size;
1160 bool enabled = buf->enabled_mask & (1 << ubo);
1161 bool empty = usz == 0;
1162
1163 if (!enabled || empty) {
1164 ubo_ptr[ubo] = 0;
1165 continue;
1166 }
1167
1168 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1169 cfg.entries = DIV_ROUND_UP(usz, 16);
1170 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1171 stage, buf, ubo);
1172 }
1173 }
1174
1175 postfix->uniforms = transfer.gpu;
1176 postfix->uniform_buffers = ubos.gpu;
1177
1178 buf->dirty_mask = 0;
1179 }
1180
1181 void
1182 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1183 const struct pipe_grid_info *info,
1184 struct midgard_payload_vertex_tiler *vtp)
1185 {
1186 struct panfrost_context *ctx = batch->ctx;
1187 struct panfrost_device *dev = pan_device(ctx->base.screen);
1188 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1189 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1190 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1191 128));
1192
1193 unsigned log2_instances =
1194 util_logbase2_ceil(info->grid[0]) +
1195 util_logbase2_ceil(info->grid[1]) +
1196 util_logbase2_ceil(info->grid[2]);
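/* Sizing example (illustrative only): a 3x4x5 grid gives log2_instances =
 * 2 + 2 + 3 = 7, so room for 128 copies of the power-of-two-rounded
 * per-workgroup allocation is reserved on every core -- a conservative
 * upper bound rather than an exact fit. */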
1197
1198 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1199 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1200 shared_size,
1201 1);
1202
1203 struct mali_shared_memory shared = {
1204 .shared_memory = bo->gpu,
1205 .shared_workgroup_count = log2_instances,
1206 .shared_shift = util_logbase2(single_size) + 1
1207 };
1208
1209 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1210 sizeof(shared), 64);
1211 }
1212
1213 static mali_ptr
1214 panfrost_get_tex_desc(struct panfrost_batch *batch,
1215 enum pipe_shader_type st,
1216 struct panfrost_sampler_view *view)
1217 {
1218 if (!view)
1219 return (mali_ptr) 0;
1220
1221 struct pipe_sampler_view *pview = &view->base;
1222 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1223
1224 /* Add the BO to the job so it's retained until the job is done. */
1225
1226 panfrost_batch_add_bo(batch, rsrc->bo,
1227 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1228 panfrost_bo_access_for_stage(st));
1229
1230 panfrost_batch_add_bo(batch, view->bo,
1231 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1232 panfrost_bo_access_for_stage(st));
1233
1234 return view->bo->gpu;
1235 }
1236
1237 static void
1238 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1239 struct pipe_context *pctx)
1240 {
1241 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1242 if (view->texture_bo != rsrc->bo->gpu ||
1243 view->modifier != rsrc->modifier) {
1244 panfrost_bo_unreference(view->bo);
1245 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1246 }
1247 }
1248
1249 void
1250 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1251 enum pipe_shader_type stage,
1252 struct mali_vertex_tiler_postfix *postfix)
1253 {
1254 struct panfrost_context *ctx = batch->ctx;
1255 struct panfrost_device *device = pan_device(ctx->base.screen);
1256
1257 if (!ctx->sampler_view_count[stage])
1258 return;
1259
1260 if (device->quirks & IS_BIFROST) {
1261 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1262 MALI_BIFROST_TEXTURE_LENGTH *
1263 ctx->sampler_view_count[stage],
1264 MALI_BIFROST_TEXTURE_LENGTH);
1265
1266 struct mali_bifrost_texture_packed *out =
1267 (struct mali_bifrost_texture_packed *) T.cpu;
1268
1269 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1270 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1271 struct pipe_sampler_view *pview = &view->base;
1272 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1273
1274 panfrost_update_sampler_view(view, &ctx->base);
1275 out[i] = view->bifrost_descriptor;
1276
1277 /* Add the BOs to the job so they are retained until the job is done. */
1278
1279 panfrost_batch_add_bo(batch, rsrc->bo,
1280 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1281 panfrost_bo_access_for_stage(stage));
1282
1283 panfrost_batch_add_bo(batch, view->bo,
1284 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1285 panfrost_bo_access_for_stage(stage));
1286 }
1287
1288 postfix->textures = T.gpu;
1289 } else {
1290 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1291
1292 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1293 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1294
1295 panfrost_update_sampler_view(view, &ctx->base);
1296
1297 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1298 }
1299
1300 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1301 trampolines,
1302 sizeof(uint64_t) *
1303 ctx->sampler_view_count[stage],
1304 sizeof(uint64_t));
1305 }
1306 }
1307
1308 void
1309 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1310 enum pipe_shader_type stage,
1311 struct mali_vertex_tiler_postfix *postfix)
1312 {
1313 struct panfrost_context *ctx = batch->ctx;
1314
1315 if (!ctx->sampler_count[stage])
1316 return;
1317
1318 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1319 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1320
1321 size_t sz = desc_size * ctx->sampler_count[stage];
1322 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1323 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1324
1325 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1326 out[i] = ctx->samplers[stage][i]->hw;
1327
1328 postfix->sampler_descriptor = T.gpu;
1329 }
1330
1331 void
1332 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1333 struct mali_vertex_tiler_postfix *vertex_postfix)
1334 {
1335 struct panfrost_context *ctx = batch->ctx;
1336 struct panfrost_vertex_state *so = ctx->vertex;
1337 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1338
1339 unsigned instance_shift = vertex_postfix->instance_shift;
1340 unsigned instance_odd = vertex_postfix->instance_odd;
1341
1342 /* Worst case: everything is NPOT, which is only possible if instancing
1343 * is enabled. Otherwise a single record is guaranteed */
1344 bool could_npot = instance_shift || instance_odd;
1345
1346 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1347 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1348 (could_npot ? 2 : 1),
1349 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1350
1351 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1352 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1353 MALI_ATTRIBUTE_LENGTH);
1354
1355 struct mali_attribute_buffer_packed *bufs =
1356 (struct mali_attribute_buffer_packed *) S.cpu;
1357
1358 struct mali_attribute_packed *out =
1359 (struct mali_attribute_packed *) T.cpu;
1360
1361 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1362 unsigned k = 0;
1363
1364 for (unsigned i = 0; i < so->num_elements; ++i) {
1365 /* We map buffers 1:1 with the attributes, which
1366 * means duplicating some vertex buffers (who cares? aside from
1367 * maybe some caching implications but I somehow doubt that
1368 * matters) */
1369
1370 struct pipe_vertex_element *elem = &so->pipe[i];
1371 unsigned vbi = elem->vertex_buffer_index;
1372 attrib_to_buffer[i] = k;
1373
1374 if (!(ctx->vb_mask & (1 << vbi)))
1375 continue;
1376
1377 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1378 struct panfrost_resource *rsrc;
1379
1380 rsrc = pan_resource(buf->buffer.resource);
1381 if (!rsrc)
1382 continue;
1383
1384 /* Add a dependency of the batch on the vertex buffer */
1385 panfrost_batch_add_bo(batch, rsrc->bo,
1386 PAN_BO_ACCESS_SHARED |
1387 PAN_BO_ACCESS_READ |
1388 PAN_BO_ACCESS_VERTEX_TILER);
1389
1390 /* Mask off lower bits, see offset fixup below */
1391 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1392 mali_ptr addr = raw_addr & ~63;
1393
1394 /* Since we advanced the base pointer, we shrink the buffer
1395 * size, but add the offset we subtracted */
1396 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1397 - buf->buffer_offset;
1398
1399 /* When there is a divisor, the hardware-level divisor is
1400 * the product of the instance divisor and the padded count */
1401 unsigned divisor = elem->instance_divisor;
1402 unsigned hw_divisor = ctx->padded_count * divisor;
1403 unsigned stride = buf->stride;
1404
1405 /* If there's a divisor(=1) but no instancing, we want every
1406 * attribute to be the same */
1407
1408 if (divisor && ctx->instance_count == 1)
1409 stride = 0;
1410
1411 if (!divisor || ctx->instance_count <= 1) {
1412 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1413 if (ctx->instance_count > 1)
1414 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1415
1416 cfg.pointer = addr;
1417 cfg.stride = stride;
1418 cfg.size = size;
1419 cfg.divisor_r = instance_shift;
1420 cfg.divisor_p = instance_odd;
1421 }
1422 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1423 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1424 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1425 cfg.pointer = addr;
1426 cfg.stride = stride;
1427 cfg.size = size;
1428 cfg.divisor_r = __builtin_ctz(hw_divisor);
1429 }
1430
1431 } else {
1432 unsigned shift = 0, extra_flags = 0;
1433
1434 unsigned magic_divisor =
1435 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
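/* Presumably this implements division by the runtime divisor as a
 * multiply-by-fixed-point-reciprocal: panfrost_compute_magic_divisor()
 * returns the magic numerator plus a shift and a rounding correction
 * (extra_flags), and the continuation record below carries the magic value
 * together with the original divisor. A sketch of the idea only, not the
 * exact hardware formula. */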
1436
1437 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1438 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1439 cfg.pointer = addr;
1440 cfg.stride = stride;
1441 cfg.size = size;
1442
1443 cfg.divisor_r = shift;
1444 cfg.divisor_e = extra_flags;
1445 }
1446
1447 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1448 cfg.divisor_numerator = magic_divisor;
1449 cfg.divisor = divisor;
1450 }
1451
1452 ++k;
1453 }
1454
1455 ++k;
1456 }
1457
1458 /* Add special gl_VertexID/gl_InstanceID buffers */
1459
1460 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1461 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1462
1463 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1464 cfg.buffer_index = k++;
1465 cfg.format = so->formats[PAN_VERTEX_ID];
1466 }
1467
1468 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1469
1470 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1471 cfg.buffer_index = k++;
1472 cfg.format = so->formats[PAN_INSTANCE_ID];
1473 }
1474 }
1475
1476 /* Attribute addresses require 64-byte alignment, so let:
1477 *
1478 * base' = base & ~63 = base - (base & 63)
1479 * offset' = offset + (base & 63)
1480 *
1481 * Since base' + offset' = base + offset, these are equivalent
1482 * addressing modes and now base is 64 aligned.
1483 */
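/* Worked example (addresses made up): base = 0x10043 has (base & 63) = 3, so
 * base' = 0x10040 is 64-byte aligned and the element's src_offset below grows
 * by 3; base' + offset' still names the same byte as base + offset. */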
1484
1485 unsigned start = vertex_postfix->offset_start;
1486
1487 for (unsigned i = 0; i < so->num_elements; ++i) {
1488 unsigned vbi = so->pipe[i].vertex_buffer_index;
1489 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1490
1491 /* Adjust by the masked off bits of the offset. Make sure we
1492 * read src_offset from so->hw (which is not GPU visible)
1493 * rather than target (which is) due to caching effects */
1494
1495 unsigned src_offset = so->pipe[i].src_offset;
1496
1497 /* BOs aligned to 4k so guaranteed aligned to 64 */
1498 src_offset += (buf->buffer_offset & 63);
1499
1500 /* Also, somewhat obscurely per-instance data needs to be
1501 * offset in response to a delayed start in an indexed draw */
1502
1503 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1504 src_offset -= buf->stride * start;
1505
1506 pan_pack(out + i, ATTRIBUTE, cfg) {
1507 cfg.buffer_index = attrib_to_buffer[i];
1508 cfg.format = so->formats[i];
1509 cfg.offset = src_offset;
1510 }
1511 }
1512
1513 vertex_postfix->attributes = S.gpu;
1514 vertex_postfix->attribute_meta = T.gpu;
1515 }
1516
1517 static mali_ptr
1518 panfrost_emit_varyings(struct panfrost_batch *batch,
1519 struct mali_attribute_buffer_packed *slot,
1520 unsigned stride, unsigned count)
1521 {
1522 unsigned size = stride * count;
1523 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1524
1525 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1526 cfg.stride = stride;
1527 cfg.size = size;
1528 cfg.pointer = ptr;
1529 }
1530
1531 return ptr;
1532 }
1533
1534 static unsigned
1535 panfrost_streamout_offset(unsigned stride, unsigned offset,
1536 struct pipe_stream_output_target *target)
1537 {
1538 return (target->buffer_offset + (offset * stride * 4)) & 63;
1539 }
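/* This is the same 64-byte alignment trick used for vertex attribute buffers
 * above: panfrost_emit_streamout() rounds the pointer down to 64 bytes and
 * grows the size by the remainder, while the low bits returned here are added
 * back into the varying record's offset (see pan_emit_vary_xfb), so no data
 * is skipped. */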
1540
1541 static void
1542 panfrost_emit_streamout(struct panfrost_batch *batch,
1543 struct mali_attribute_buffer_packed *slot,
1544 unsigned stride_words, unsigned offset, unsigned count,
1545 struct pipe_stream_output_target *target)
1546 {
1547 unsigned stride = stride_words * 4;
1548 unsigned max_size = target->buffer_size;
1549 unsigned expected_size = stride * count;
1550
1551 /* Grab the BO and bind it to the batch */
1552 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1553
1554 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1555 * the perspective of the TILER and FRAGMENT.
1556 */
1557 panfrost_batch_add_bo(batch, bo,
1558 PAN_BO_ACCESS_SHARED |
1559 PAN_BO_ACCESS_RW |
1560 PAN_BO_ACCESS_VERTEX_TILER |
1561 PAN_BO_ACCESS_FRAGMENT);
1562
1563 /* We will have an offset applied to get alignment */
1564 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1565
1566 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1567 cfg.pointer = (addr & ~63);
1568 cfg.stride = stride;
1569 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1570 }
1571 }
1572
1573 static bool
1574 has_point_coord(unsigned mask, gl_varying_slot loc)
1575 {
1576 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1577 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1578 else if (loc == VARYING_SLOT_PNTC)
1579 return (mask & (1 << 8));
1580 else
1581 return false;
1582 }
1583
1584 /* Helpers for manipulating stream out information so we can pack varyings
1585 * accordingly. Compute the src_offset for a given captured varying */
1586
1587 static struct pipe_stream_output *
1588 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1589 {
1590 for (unsigned i = 0; i < info->num_outputs; ++i) {
1591 if (info->output[i].register_index == loc)
1592 return &info->output[i];
1593 }
1594
1595 unreachable("Varying not captured");
1596 }
1597
1598 static unsigned
1599 pan_varying_size(enum mali_format fmt)
1600 {
1601 unsigned type = MALI_EXTRACT_TYPE(fmt);
1602 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1603 unsigned bits = MALI_EXTRACT_BITS(fmt);
1604 unsigned bpc = 0;
1605
1606 if (bits == MALI_CHANNEL_FLOAT) {
1607 /* No doubles */
1608 bool fp16 = (type == MALI_FORMAT_SINT);
1609 assert(fp16 || (type == MALI_FORMAT_UNORM));
1610
1611 bpc = fp16 ? 2 : 4;
1612 } else {
1613 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1614
1615 /* See the enums */
1616 bits = 1 << bits;
1617 assert(bits >= 8);
1618 bpc = bits / 8;
1619 }
1620
1621 return bpc * chan;
1622 }
1623
1624 /* Indices for named (non-XFB) varyings that are present. These are packed
1625 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1626 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1627 * of a given special field given a shift S by:
1628 *
1629 * idx = popcount(P & ((1 << S) - 1))
1630 *
1631 * That is... look at all of the varyings that come earlier and count them; that
1632 * count is this varying's buffer index. Likewise, the total number of special
1633 * buffers required is simply popcount(P)
1634 */
1635
1636 enum pan_special_varying {
1637 PAN_VARY_GENERAL = 0,
1638 PAN_VARY_POSITION = 1,
1639 PAN_VARY_PSIZ = 2,
1640 PAN_VARY_PNTCOORD = 3,
1641 PAN_VARY_FACE = 4,
1642 PAN_VARY_FRAGCOORD = 5,
1643
1644 /* Keep last */
1645 PAN_VARY_MAX,
1646 };
1647
1648 /* Given a varying, figure out which index it corresponds to */
1649
1650 static inline unsigned
1651 pan_varying_index(unsigned present, enum pan_special_varying v)
1652 {
1653 unsigned mask = (1 << v) - 1;
1654 return util_bitcount(present & mask);
1655 }
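/* For instance, with present = (1 << PAN_VARY_GENERAL) |
 * (1 << PAN_VARY_POSITION) | (1 << PAN_VARY_PSIZ) = 0b111,
 * pan_varying_index(present, PAN_VARY_PSIZ) = popcount(0b111 & 0b011) = 2:
 * the point size buffer is the third record, after general and position. */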
1656
1657 /* Get the base offset for XFB buffers, which by convention come after
1658 * everything else. Wrapper function for semantic reasons; by construction this
1659 * is just popcount. */
1660
1661 static inline unsigned
1662 pan_xfb_base(unsigned present)
1663 {
1664 return util_bitcount(present);
1665 }
1666
1667 /* Computes the present mask for varyings so we can start emitting varying records */
1668
1669 static inline unsigned
1670 pan_varying_present(
1671 struct panfrost_shader_state *vs,
1672 struct panfrost_shader_state *fs,
1673 unsigned quirks)
1674 {
1675 /* At the moment we always emit general and position buffers. Not
1676 * strictly necessary but usually harmless */
1677
1678 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1679
1680 /* Enable special buffers by the shader info */
1681
1682 if (vs->writes_point_size)
1683 present |= (1 << PAN_VARY_PSIZ);
1684
1685 if (fs->reads_point_coord)
1686 present |= (1 << PAN_VARY_PNTCOORD);
1687
1688 if (fs->reads_face)
1689 present |= (1 << PAN_VARY_FACE);
1690
1691 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1692 present |= (1 << PAN_VARY_FRAGCOORD);
1693
1694 /* Also, if we have a point sprite, we need a point coord buffer */
1695
1696 for (unsigned i = 0; i < fs->varying_count; i++) {
1697 gl_varying_slot loc = fs->varyings_loc[i];
1698
1699 if (has_point_coord(fs->point_sprite_mask, loc))
1700 present |= (1 << PAN_VARY_PNTCOORD);
1701 }
1702
1703 return present;
1704 }
1705
1706 /* Emitters for varying records */
1707
1708 static void
1709 pan_emit_vary(struct mali_attribute_packed *out,
1710 unsigned present, enum pan_special_varying buf,
1711 unsigned quirks, enum mali_format format,
1712 unsigned offset)
1713 {
1714 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1715 unsigned swizzle = quirks & HAS_SWIZZLES ?
1716 panfrost_get_default_swizzle(nr_channels) :
1717 panfrost_bifrost_swizzle(nr_channels);
1718
1719 pan_pack(out, ATTRIBUTE, cfg) {
1720 cfg.buffer_index = pan_varying_index(present, buf);
1721 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1722 cfg.format = (format << 12) | swizzle;
1723 cfg.offset = offset;
1724 }
1725 }
1726
1727 /* General varying that is unused */
1728
1729 static void
1730 pan_emit_vary_only(struct mali_attribute_packed *out,
1731 unsigned present, unsigned quirks)
1732 {
1733 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1734 }
1735
1736 /* Special records */
1737
1738 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1739 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1740 [PAN_VARY_PSIZ] = MALI_R16F,
1741 [PAN_VARY_PNTCOORD] = MALI_R16F,
1742 [PAN_VARY_FACE] = MALI_R32I,
1743 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1744 };
1745
1746 static void
1747 pan_emit_vary_special(struct mali_attribute_packed *out,
1748 unsigned present, enum pan_special_varying buf,
1749 unsigned quirks)
1750 {
1751 assert(buf < PAN_VARY_MAX);
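        /* Special varyings always live at offset 0 of their dedicated buffer */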
1752 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1753 }
1754
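/* XFB captures are always recorded at 32 bits per channel, with the channel
 * count taken from the stream output info, regardless of the format used for
 * shading (the "highp override" applied at the call site below) */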
1755 static enum mali_format
1756 pan_xfb_format(enum mali_format format, unsigned nr)
1757 {
1758 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1759 return MALI_R32F | MALI_NR_CHANNELS(nr);
1760 else
1761 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1762 }
1763
1764 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1765 * a bitfield) 32-bit, smaller than a 64-bit pointer, so we may as well pass
1766 * it by value. */
1767
1768 static void
1769 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1770 unsigned present,
1771 unsigned max_xfb,
1772 unsigned *streamout_offsets,
1773 unsigned quirks,
1774 enum mali_format format,
1775 struct pipe_stream_output o)
1776 {
1777 unsigned swizzle = quirks & HAS_SWIZZLES ?
1778 panfrost_get_default_swizzle(o.num_components) :
1779 panfrost_bifrost_swizzle(o.num_components);
1780
1781 pan_pack(out, ATTRIBUTE, cfg) {
1782 /* XFB buffers come after everything else */
1783 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1784 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1785
1786 /* Override number of channels and precision to highp */
1787 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1788
1789 /* Combine the destination offset with the streamout buffer offset */
1790 cfg.offset = (o.dst_offset * 4) /* dwords */
1791 + streamout_offsets[o.output_buffer];
1792 }
1793 }
1794
1795 /* Determine if we should capture a varying for XFB. This requires actually
1796 * having a buffer for it. If we don't capture it, we'll fall back to a general
1797 * varying path (linked or unlinked, possibly discarding the write) */
1798
1799 static bool
1800 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1801 unsigned loc, unsigned max_xfb)
1802 {
1803 if (!(xfb->so_mask & (1ll << loc)))
1804 return false;
1805
1806 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1807 return o->output_buffer < max_xfb;
1808 }
1809
1810 static void
1811 pan_emit_general_varying(struct mali_attribute_packed *out,
1812 struct panfrost_shader_state *other,
1813 struct panfrost_shader_state *xfb,
1814 gl_varying_slot loc,
1815 enum mali_format format,
1816 unsigned present,
1817 unsigned quirks,
1818 unsigned *gen_offsets,
1819 enum mali_format *gen_formats,
1820 unsigned *gen_stride,
1821 unsigned idx,
1822 bool should_alloc)
1823 {
1824 /* Check if we're linked */
1825 signed other_idx = -1;
1826
1827 for (unsigned j = 0; j < other->varying_count; ++j) {
1828 if (other->varyings_loc[j] == loc) {
1829 other_idx = j;
1830 break;
1831 }
1832 }
1833
1834 if (other_idx < 0) {
1835 pan_emit_vary_only(out, present, quirks);
1836 return;
1837 }
1838
1839 unsigned offset = gen_offsets[other_idx];
1840
1841 if (should_alloc) {
1842 /* We're linked, so allocate space via a watermark allocation */
1843 enum mali_format alt = other->varyings[other_idx];
1844
1845 /* Do interpolation at minimum precision */
1846 unsigned size_main = pan_varying_size(format);
1847 unsigned size_alt = pan_varying_size(alt);
1848 unsigned size = MIN2(size_main, size_alt);
1849
1850 /* If a varying is marked for XFB but not actually captured, we
1851 * should match the format to the format that would otherwise
1852 * be used for XFB, since dEQP checks for invariance here. It's
1853 * unclear if this is required by the spec. */
1854
1855 if (xfb->so_mask & (1ull << loc)) {
1856 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1857 format = pan_xfb_format(format, o->num_components);
1858 size = pan_varying_size(format);
1859 } else if (size == size_alt) {
1860 format = alt;
1861 }
1862
1863 gen_offsets[idx] = *gen_stride;
1864 gen_formats[other_idx] = format;
1865 offset = *gen_stride;
1866 *gen_stride += size;
1867 }
1868
1869 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1870 }
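/* Note the indexing above: gen_offsets[] is written at the allocating
 * (vertex) stage's slot index, while gen_formats[] is written at the
 * consuming stage's slot index ("other", i.e. the fragment shader during the
 * allocating pass). The later fragment pass reads gen_offsets[other_idx] here
 * and gen_formats[idx] in panfrost_emit_varying(), so both stages end up
 * agreeing on the linked offset and format. */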
1871
1872 /* Higher-level wrapper around all of the above, classifying a varying into one
1873 * of the above types */
1874
1875 static void
1876 panfrost_emit_varying(
1877 struct mali_attribute_packed *out,
1878 struct panfrost_shader_state *stage,
1879 struct panfrost_shader_state *other,
1880 struct panfrost_shader_state *xfb,
1881 unsigned present,
1882 unsigned max_xfb,
1883 unsigned *streamout_offsets,
1884 unsigned quirks,
1885 unsigned *gen_offsets,
1886 enum mali_format *gen_formats,
1887 unsigned *gen_stride,
1888 unsigned idx,
1889 bool should_alloc,
1890 bool is_fragment)
1891 {
1892 gl_varying_slot loc = stage->varyings_loc[idx];
1893 enum mali_format format = stage->varyings[idx];
1894
1895 /* Override format to match linkage */
1896 if (!should_alloc && gen_formats[idx])
1897 format = gen_formats[idx];
1898
1899 if (has_point_coord(stage->point_sprite_mask, loc)) {
1900 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1901 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1902 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1903 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1904 } else if (loc == VARYING_SLOT_POS) {
1905 if (is_fragment)
1906 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1907 else
1908 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1909 } else if (loc == VARYING_SLOT_PSIZ) {
1910 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1911 } else if (loc == VARYING_SLOT_PNTC) {
1912 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1913 } else if (loc == VARYING_SLOT_FACE) {
1914 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1915 } else {
1916 pan_emit_general_varying(out, other, xfb, loc, format, present,
1917 quirks, gen_offsets, gen_formats, gen_stride,
1918 idx, should_alloc);
1919 }
1920 }
1921
1922 static void
1923 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1924 unsigned present,
1925 enum pan_special_varying v,
1926 unsigned special)
1927 {
1928 if (present & (1 << v)) {
1929 unsigned idx = pan_varying_index(present, v);
1930
1931 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1932 cfg.special = special;
1933 cfg.type = 0;
1934 }
1935 }
1936 }
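/* No pointer or size is written for a special input; the record just selects
 * which special value (point coordinate, front-facing, fragment coordinate)
 * feeds the varying at this buffer index. */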
1937
1938 void
1939 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1940 unsigned vertex_count,
1941 struct mali_vertex_tiler_postfix *vertex_postfix,
1942 struct mali_vertex_tiler_postfix *tiler_postfix,
1943 union midgard_primitive_size *primitive_size)
1944 {
1945 /* Load the shaders */
1946 struct panfrost_context *ctx = batch->ctx;
1947 struct panfrost_device *dev = pan_device(ctx->base.screen);
1948 struct panfrost_shader_state *vs, *fs;
1949 size_t vs_size, fs_size;
1950
1951 /* Allocate the varying descriptor */
1952
1953 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1954 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1955 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1956 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1957
1958 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1959 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1960
1961 struct pipe_stream_output_info *so = &vs->stream_output;
1962 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1963
1964 /* Check if this varying is linked by us. This is the case for
1965 * general-purpose, non-captured varyings. If it is, link it. If it's
1966 * not, use the provided stream out information to determine the
1967 * offset, since it was already linked for us. */
1968
1969 unsigned gen_offsets[32];
1970 enum mali_format gen_formats[32];
1971 memset(gen_offsets, 0, sizeof(gen_offsets));
1972 memset(gen_formats, 0, sizeof(gen_formats));
1973
1974 unsigned gen_stride = 0;
1975 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1976 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1977
1978 unsigned streamout_offsets[32];
1979
1980 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1981 streamout_offsets[i] = panfrost_streamout_offset(
1982 so->stride[i],
1983 ctx->streamout.offsets[i],
1984 ctx->streamout.targets[i]);
1985 }
1986
1987 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1988 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1989
1990 for (unsigned i = 0; i < vs->varying_count; i++) {
1991 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1992 ctx->streamout.num_targets, streamout_offsets,
1993 dev->quirks,
1994 gen_offsets, gen_formats, &gen_stride, i, true, false);
1995 }
1996
1997 for (unsigned i = 0; i < fs->varying_count; i++) {
1998 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1999 ctx->streamout.num_targets, streamout_offsets,
2000 dev->quirks,
2001 gen_offsets, gen_formats, &gen_stride, i, false, true);
2002 }
2003
2004 unsigned xfb_base = pan_xfb_base(present);
2005 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
2006 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
2007 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
2008 struct mali_attribute_buffer_packed *varyings =
2009 (struct mali_attribute_buffer_packed *) T.cpu;
2010
2011 /* Emit the stream out buffers */
2012
2013 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2014 ctx->vertex_count);
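/* (u_stream_outputs_for_vertices converts the draw's vertex count into the
 * number of vertices stream output will record, with strips and fans
 * decomposed into independent primitives) */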
2015
2016 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2017 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2018 so->stride[i],
2019 ctx->streamout.offsets[i],
2020 out_count,
2021 ctx->streamout.targets[i]);
2022 }
2023
2024 panfrost_emit_varyings(batch,
2025 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2026 gen_stride, vertex_count);
2027
2028 /* fp32 vec4 gl_Position */
2029 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2030 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2031 sizeof(float) * 4, vertex_count);
2032
2033 if (present & (1 << PAN_VARY_PSIZ)) {
2034 primitive_size->pointer = panfrost_emit_varyings(batch,
2035 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2036 2, vertex_count);
2037 }
2038
2039 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2040 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2041 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2042
2043 vertex_postfix->varyings = T.gpu;
2044 tiler_postfix->varyings = T.gpu;
2045
2046 vertex_postfix->varying_meta = trans.gpu;
2047 tiler_postfix->varying_meta = trans.gpu + vs_size;
2048 }
2049
2050 void
2051 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2052 struct mali_vertex_tiler_prefix *vertex_prefix,
2053 struct mali_vertex_tiler_postfix *vertex_postfix,
2054 struct mali_vertex_tiler_prefix *tiler_prefix,
2055 struct mali_vertex_tiler_postfix *tiler_postfix,
2056 union midgard_primitive_size *primitive_size)
2057 {
2058 struct panfrost_context *ctx = batch->ctx;
2059 struct panfrost_device *device = pan_device(ctx->base.screen);
2060 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2061 struct bifrost_payload_vertex bifrost_vertex = {0,};
2062 struct bifrost_payload_tiler bifrost_tiler = {0,};
2063 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2064 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2065 void *vp, *tp;
2066 size_t vp_size, tp_size;
2067
2068 if (device->quirks & IS_BIFROST) {
2069 bifrost_vertex.prefix = *vertex_prefix;
2070 bifrost_vertex.postfix = *vertex_postfix;
2071 vp = &bifrost_vertex;
2072 vp_size = sizeof(bifrost_vertex);
2073
2074 bifrost_tiler.prefix = *tiler_prefix;
2075 bifrost_tiler.tiler.primitive_size = *primitive_size;
2076 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2077 bifrost_tiler.postfix = *tiler_postfix;
2078 tp = &bifrost_tiler;
2079 tp_size = sizeof(bifrost_tiler);
2080 } else {
2081 midgard_vertex.prefix = *vertex_prefix;
2082 midgard_vertex.postfix = *vertex_postfix;
2083 vp = &midgard_vertex;
2084 vp_size = sizeof(midgard_vertex);
2085
2086 midgard_tiler.prefix = *tiler_prefix;
2087 midgard_tiler.postfix = *tiler_postfix;
2088 midgard_tiler.primitive_size = *primitive_size;
2089 tp = &midgard_tiler;
2090 tp_size = sizeof(midgard_tiler);
2091 }
2092
2093 if (wallpapering) {
2094 /* Inject in reverse order, with "predicted" job indices.
2095 * THIS IS A HACK XXX */
2096 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2097 batch->scoreboard.job_index + 2, tp, tp_size, true);
2098 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2099 vp, vp_size, true);
2100 return;
2101 }
2102
2103 /* If rasterizer discard is enabled, only submit the vertex job */
2104
2105 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2106 vp, vp_size, false);
2107
2108 if (ctx->rasterizer->base.rasterizer_discard)
2109 return;
2110
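/* The tiler job depends on the vertex job: the index returned above is passed
 * as its dependency, so tiling is scoreboarded to run after vertex shading. */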
2111 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2112 false);
2113 }
2114
2115 /* TODO: stop hardcoding this */
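/* 48 (x, y) pairs, uploaded as 96 uint16_t values; (128, 128) presumably marks
 * the pixel centre in 1/256-pixel units. */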
2116 mali_ptr
2117 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2118 {
2119 uint16_t locations[] = {
2120 128, 128,
2121 0, 256,
2122 0, 256,
2123 0, 256,
2124 0, 256,
2125 0, 256,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 0, 256,
2130 0, 256,
2131 0, 256,
2132 0, 256,
2133 0, 256,
2134 0, 256,
2135 0, 256,
2136 0, 256,
2137 0, 256,
2138 0, 256,
2139 0, 256,
2140 0, 256,
2141 0, 256,
2142 0, 256,
2143 0, 256,
2144 0, 256,
2145 0, 256,
2146 0, 256,
2147 0, 256,
2148 0, 256,
2149 0, 256,
2150 0, 256,
2151 0, 256,
2152 128, 128,
2153 0, 0,
2154 0, 0,
2155 0, 0,
2156 0, 0,
2157 0, 0,
2158 0, 0,
2159 0, 0,
2160 0, 0,
2161 0, 0,
2162 0, 0,
2163 0, 0,
2164 0, 0,
2165 0, 0,
2166 0, 0,
2167 0, 0,
2168 };
2169
2170 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2171 }