panfrost: Group SFBD code tighter
src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), though it could last longer. Also get
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
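/* Derive shift/odd assuming the padded count factors as an odd number
 * times a power of two, i.e. (2k + 1) << shift; e.g. a padded count of
 * 12 = 3 << 2 gives shift = 2, k = 1. */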
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static void
310 panfrost_emit_compute_shader(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 meta->shader = ss->shader;
319 meta->attribute_count = ss->attribute_count;
320 meta->varying_count = ss->varying_count;
321 meta->texture_count = ctx->sampler_view_count[st];
322 meta->sampler_count = ctx->sampler_count[st];
323
324 if (dev->quirks & IS_BIFROST) {
325 struct mali_bifrost_properties_packed prop;
326
327 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
328 cfg.unknown = 0x800000; /* XXX */
329 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, st);
330 }
331
332 memcpy(&meta->bifrost_props, &prop, sizeof(prop));
333
334 meta->bifrost2.preload_regs = 0xC0;
335 meta->bifrost2.uniform_count = ss->uniform_count;
336 } else {
337 struct mali_midgard_properties_packed prop;
338
339 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
340 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, st);
341 cfg.uniform_count = ss->uniform_count;
342 cfg.work_register_count = ss->work_reg_count;
343 cfg.writes_globals = ss->writes_global;
344 cfg.suppress_inf_nan = true; /* XXX */
345 }
346
347 memcpy(&meta->midgard_props, &prop, sizeof(prop));
348 }
349 }
350
351 static unsigned
352 translate_tex_wrap(enum pipe_tex_wrap w)
353 {
354 switch (w) {
355 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
356 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
357 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
358 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
359 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
360 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
361 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
362 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
363 default: unreachable("Invalid wrap");
364 }
365 }
366
367 /* The hardware compares in the wrong order, so we have to flip before
368 * encoding. Yes, really. */
369
370 static enum mali_func
371 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
372 {
373 if (!cso->compare_mode)
374 return MALI_FUNC_NEVER;
375
376 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
377 return panfrost_flip_compare_func(f);
378 }
379
380 static enum mali_mipmap_mode
381 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
382 {
383 switch (f) {
384 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
385 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
386 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
387 default: unreachable("Invalid");
388 }
389 }
390
391 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
392 struct mali_midgard_sampler_packed *hw)
393 {
394 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
395 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
396 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
397 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
398 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
399 cfg.normalized_coordinates = cso->normalized_coords;
400
401 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
402
403 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
404
405 /* If necessary, we disable mipmapping in the sampler descriptor by
406 * clamping the LOD as tight as possible (from 0 to epsilon,
407 * essentially -- remember these are fixed point numbers, so
408 * epsilon=1/256) */
409
410 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
411 cfg.minimum_lod + 1 :
412 FIXED_16(cso->max_lod, false);
413
414 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
415 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
416 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
417
418 cfg.compare_function = panfrost_sampler_compare_func(cso);
419 cfg.seamless_cube_map = cso->seamless_cube_map;
420
421 cfg.border_color_r = cso->border_color.f[0];
422 cfg.border_color_g = cso->border_color.f[1];
423 cfg.border_color_b = cso->border_color.f[2];
424 cfg.border_color_a = cso->border_color.f[3];
425 }
426 }
427
428 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
429 struct mali_bifrost_sampler_packed *hw)
430 {
431 pan_pack(hw, BIFROST_SAMPLER, cfg) {
432 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
433 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
434 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
435 cfg.normalized_coordinates = cso->normalized_coords;
436
437 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
438 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
439 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
440
441 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
442 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
443 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
444
445 cfg.compare_function = panfrost_sampler_compare_func(cso);
446 cfg.seamless_cube_map = cso->seamless_cube_map;
447 }
448 }
449
450 static bool
451 panfrost_fs_required(
452 struct panfrost_shader_state *fs,
453 struct panfrost_blend_final *blend,
454 unsigned rt_count)
455 {
456 /* If we generally have side effects */
457 if (fs->fs_sidefx)
458 return true;
459
460 /* If colour is written we need to execute */
461 for (unsigned i = 0; i < rt_count; ++i) {
462 if (!blend[i].no_colour)
463 return true;
464 }
465
466 /* If depth is written and not implied we need to execute.
467 * TODO: Predicate on Z/S writes being enabled */
468 return (fs->writes_depth || fs->writes_stencil);
469 }
470
471 static void
472 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
473 struct panfrost_blend_final *blend)
474 {
475 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
476 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
477 unsigned rt_count = batch->key.nr_cbufs;
478
479 struct bifrost_blend_rt *brts = rts;
480 struct midgard_blend_rt *mrts = rts;
481
482 /* Disable blending for depth-only on Bifrost */
483
484 if (rt_count == 0 && dev->quirks & IS_BIFROST)
485 brts[0].unk2 = 0x3;
486
487 for (unsigned i = 0; i < rt_count; ++i) {
488 unsigned flags = 0;
489
490 pan_pack(&flags, BLEND_FLAGS, cfg) {
491 if (blend[i].no_colour) {
492 cfg.enable = false;
493 break;
494 }
495
496 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
497
498 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
499 cfg.load_destination = blend[i].load_dest;
500 cfg.dither_disable = !batch->ctx->blend->base.dither;
501
502 if (!(dev->quirks & IS_BIFROST))
503 cfg.midgard_blend_shader = blend[i].is_shader;
504 }
505
506 if (dev->quirks & IS_BIFROST) {
507 brts[i].flags = flags;
508
509 if (blend[i].is_shader) {
510 /* The blend shader's address needs to be at
511 * the same top 32 bits as the fragment shader.
512 * TODO: Ensure that's always the case.
513 */
514 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
515 (fs->bo->gpu & (0xffffffffull << 32)));
516 brts[i].shader = blend[i].shader.gpu;
517 brts[i].unk2 = 0x0;
518 } else {
519 enum pipe_format format = batch->key.cbufs[i]->format;
520 const struct util_format_description *format_desc;
521 format_desc = util_format_description(format);
522
523 brts[i].equation = blend[i].equation.equation;
524
525 /* TODO: this is a bit more complicated */
526 brts[i].constant = blend[i].equation.constant;
527
528 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
529
530 /* 0x19 disables blending and forces REPLACE
531 * mode (equivalent to rgb_mode = alpha_mode =
532 * x122, colour mask = 0xF). 0x1a allows
533 * blending. */
534 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
535
536 brts[i].shader_type = fs->blend_types[i];
537 }
538 } else {
539 memcpy(&mrts[i].flags, &flags, sizeof(flags));
540
541 if (blend[i].is_shader) {
542 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
543 } else {
544 mrts[i].blend.equation = blend[i].equation.equation;
545 mrts[i].blend.constant = blend[i].equation.constant;
546 }
547 }
548 }
549 }
550
551 static void
552 panfrost_emit_frag_shader(struct panfrost_context *ctx,
553 struct mali_shader_meta *fragmeta,
554 struct panfrost_blend_final *blend)
555 {
556 const struct panfrost_device *dev = pan_device(ctx->base.screen);
557 struct panfrost_shader_state *fs;
558
559 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
560
561 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
562 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
563 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
564
565 memset(fragmeta, 0, sizeof(*fragmeta));
566
567 fragmeta->shader = fs->shader;
568 fragmeta->attribute_count = fs->attribute_count;
569 fragmeta->varying_count = fs->varying_count;
570 fragmeta->texture_count = ctx->sampler_view_count[PIPE_SHADER_FRAGMENT];
571 fragmeta->sampler_count = ctx->sampler_count[PIPE_SHADER_FRAGMENT];
572
573 if (dev->quirks & IS_BIFROST) {
574 struct mali_bifrost_properties_packed prop;
575
576 bool no_blend = true;
577
578 for (unsigned i = 0; i < rt_count; ++i)
579 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
580
581 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
582 cfg.unknown = 0x950020; /* XXX */
583 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
584 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
585 }
586
587 memcpy(&fragmeta->bifrost_props, &prop, sizeof(prop));
588
589 fragmeta->bifrost2.preload_regs = 0x1;
590 SET_BIT(fragmeta->bifrost2.preload_regs, 0x10, fs->reads_frag_coord);
591
592 fragmeta->bifrost2.uniform_count = fs->uniform_count;
593 } else {
594 struct mali_midgard_properties_packed prop;
595
596 /* Reasons to disable early-Z from a shader perspective */
597 bool late_z = fs->can_discard || fs->writes_global ||
598 fs->writes_depth || fs->writes_stencil;
599
600 /* Reasons to disable early-Z from a CSO perspective */
601 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
602
603 /* If either depth or stencil is enabled, discard matters */
604 bool zs_enabled =
605 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
606 zsa->base.stencil[0].enabled;
607
608 bool has_blend_shader = false;
609
610 for (unsigned c = 0; c < rt_count; ++c)
611 has_blend_shader |= blend[c].is_shader;
612
613 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
614 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
615 cfg.uniform_count = fs->uniform_count;
616 cfg.work_register_count = fs->work_reg_count;
617 cfg.writes_globals = fs->writes_global;
618 cfg.suppress_inf_nan = true; /* XXX */
619
620 /* TODO: Reduce this limit? */
621 if (has_blend_shader)
622 cfg.work_register_count = MAX2(cfg.work_register_count, 8);
623
624 cfg.stencil_from_shader = fs->writes_stencil;
625 cfg.helper_invocation_enable = fs->helper_invocations;
626 cfg.depth_source = fs->writes_depth ?
627 MALI_DEPTH_SOURCE_SHADER :
628 MALI_DEPTH_SOURCE_FIXED_FUNCTION;
629
630 /* Depend on other state */
631 cfg.early_z_enable = !(late_z || alpha_to_coverage);
632 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
633 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
634 }
635
636 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
637 }
638
639 bool msaa = rast->multisample;
640 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
641
642 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
643 fragmeta->unknown2_4 = 0x4e0;
644
645 /* TODO: Sample size */
646 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
647 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
648
649 /* EXT_shader_framebuffer_fetch requires the shader to be run
650 * per-sample when outputs are read. */
651 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
652 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
653
654 fragmeta->depth_units = rast->offset_units * 2.0f;
655 fragmeta->depth_factor = rast->offset_scale;
656
657 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
658
659 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
660 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
661
662 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
663 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
664
665 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
666 zsa->base.stencil[0].enabled);
667
668 fragmeta->stencil_mask_front = zsa->stencil_mask_front;
669 fragmeta->stencil_mask_back = zsa->stencil_mask_back;
670
671 /* Bottom bits for stencil ref, exactly one word */
672 fragmeta->stencil_front.opaque[0] = zsa->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
673
674 /* If back-stencil is not enabled, use the front values */
675
676 if (zsa->base.stencil[1].enabled)
677 fragmeta->stencil_back.opaque[0] = zsa->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
678 else
679 fragmeta->stencil_back = fragmeta->stencil_front;
680
681 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
682 zsa->base.depth.writemask);
683
684 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
685 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
686 zsa->base.depth.enabled ? zsa->base.depth.func : PIPE_FUNC_ALWAYS));
687
688 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
689 ctx->blend->base.alpha_to_coverage);
690
691 /* Disable shader execution if we can */
692 if (dev->quirks & MIDGARD_SHADERLESS
693 && !panfrost_fs_required(fs, blend, rt_count)) {
694 fragmeta->shader = 0x1;
695 fragmeta->attribute_count = 0;
696 fragmeta->varying_count = 0;
697 fragmeta->texture_count = 0;
698 fragmeta->sampler_count = 0;
699
700 /* This feature is not known to work on Bifrost */
701 struct mali_midgard_properties_packed prop;
702
703 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
704 cfg.work_register_count = 1;
705 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
706 cfg.early_z_enable = true;
707 }
708
709 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
710 }
711
712 if (dev->quirks & MIDGARD_SFBD) {
713 /* On single framebuffer descriptor (SFBD) platforms there is only one render
714 * target, so the blend information lives inside the shader meta itself. We
715 * additionally need to signal CAN_DISCARD for nontrivial blend modes (so
716 * we're able to read back the destination buffer) */
717
718 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
719 blend[0].is_shader);
720
721 if (blend[0].is_shader) {
722 fragmeta->blend.shader = blend[0].shader.gpu |
723 blend[0].shader.first_tag;
724 } else {
725 fragmeta->blend.equation = blend[0].equation.equation;
726 fragmeta->blend.constant = blend[0].equation.constant;
727 }
728
729 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
730 blend[0].load_dest);
731
732 fragmeta->unknown2_4 |= 0x10;
733 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER, !ctx->blend->base.dither);
734 } else if (!(dev->quirks & IS_BIFROST)) {
735 /* Bug where MRT-capable hw apparently reads the last blend
736 * shader from here instead of the usual location? */
737
738 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
739 if (!blend[rt].is_shader)
740 continue;
741
742 fragmeta->blend.shader = blend[rt].shader.gpu |
743 blend[rt].shader.first_tag;
744 break;
745 }
746 }
747 }
748
749 void
750 panfrost_emit_shader_meta(struct panfrost_batch *batch,
751 enum pipe_shader_type st,
752 struct mali_vertex_tiler_postfix *postfix)
753 {
754 struct panfrost_context *ctx = batch->ctx;
755 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
756
757 if (!ss) {
758 postfix->shader = 0;
759 return;
760 }
761
762 struct mali_shader_meta meta;
763
764 /* Add the shader BO to the batch. */
765 panfrost_batch_add_bo(batch, ss->bo,
766 PAN_BO_ACCESS_PRIVATE |
767 PAN_BO_ACCESS_READ |
768 panfrost_bo_access_for_stage(st));
769
770 mali_ptr shader_ptr;
771
772 if (st == PIPE_SHADER_FRAGMENT) {
773 struct panfrost_device *dev = pan_device(ctx->base.screen);
774 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
775 size_t desc_size = sizeof(meta);
776 void *rts = NULL;
777 struct panfrost_transfer xfer;
778 unsigned rt_size;
779
780 if (dev->quirks & MIDGARD_SFBD)
781 rt_size = 0;
782 else if (dev->quirks & IS_BIFROST)
783 rt_size = sizeof(struct bifrost_blend_rt);
784 else
785 rt_size = sizeof(struct midgard_blend_rt);
786
787 desc_size += rt_size * rt_count;
788
789 if (rt_size)
790 rts = rzalloc_size(ctx, rt_size * rt_count);
791
792 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
793
794 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
795 blend[c] = panfrost_get_blend_for_context(ctx, c);
796
797 panfrost_emit_frag_shader(ctx, &meta, blend);
798
799 if (!(dev->quirks & MIDGARD_SFBD))
800 panfrost_emit_blend(batch, rts, blend);
801 else
802 batch->draws |= PIPE_CLEAR_COLOR0;
803
804 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
805
806 memcpy(xfer.cpu, &meta, sizeof(meta));
807 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
808
809 if (rt_size)
810 ralloc_free(rts);
811
812 shader_ptr = xfer.gpu;
813 } else {
814 panfrost_emit_compute_shader(ctx, st, &meta);
815
816 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
817 sizeof(meta));
818 }
819
820 postfix->shader = shader_ptr;
821 }
822
823 void
824 panfrost_emit_viewport(struct panfrost_batch *batch,
825 struct mali_vertex_tiler_postfix *tiler_postfix)
826 {
827 struct panfrost_context *ctx = batch->ctx;
828 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
829 const struct pipe_scissor_state *ss = &ctx->scissor;
830 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
831 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
832
833 /* Derive min/max from translate/scale. Note since |x| >= 0 by
834 * definition, we have that -|x| <= |x| hence translate - |scale| <=
835 * translate + |scale|, so the ordering is correct here. */
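/* For a typical glViewport(x, y, w, h) the state tracker supplies
 * scale[0] = w/2 and translate[0] = x + w/2, so vp_minx = x and
 * vp_maxx = x + w (and analogously for y). */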
836 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
837 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
838 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
839 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
840 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
841 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
842
843 /* Scissor to the intersection of the viewport and the scissor, clamped
844 * to the framebuffer */
845
846 unsigned minx = MIN2(fb->width, vp_minx);
847 unsigned maxx = MIN2(fb->width, vp_maxx);
848 unsigned miny = MIN2(fb->height, vp_miny);
849 unsigned maxy = MIN2(fb->height, vp_maxy);
850
851 if (ss && rast->scissor) {
852 minx = MAX2(ss->minx, minx);
853 miny = MAX2(ss->miny, miny);
854 maxx = MIN2(ss->maxx, maxx);
855 maxy = MIN2(ss->maxy, maxy);
856 }
857
858 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
859
860 pan_pack(T.cpu, VIEWPORT, cfg) {
861 cfg.scissor_minimum_x = minx;
862 cfg.scissor_minimum_y = miny;
863 cfg.scissor_maximum_x = maxx - 1;
864 cfg.scissor_maximum_y = maxy - 1;
865
866 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
867 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
868 }
869
870 tiler_postfix->viewport = T.gpu;
871 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
872 }
873
874 static mali_ptr
875 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
876 enum pipe_shader_type st,
877 struct panfrost_constant_buffer *buf,
878 unsigned index)
879 {
880 struct pipe_constant_buffer *cb = &buf->cb[index];
881 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
882
883 if (rsrc) {
884 panfrost_batch_add_bo(batch, rsrc->bo,
885 PAN_BO_ACCESS_SHARED |
886 PAN_BO_ACCESS_READ |
887 panfrost_bo_access_for_stage(st));
888
889 /* Alignment guaranteed by
890 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
891 return rsrc->bo->gpu + cb->buffer_offset;
892 } else if (cb->user_buffer) {
893 return panfrost_pool_upload_aligned(&batch->pool,
894 cb->user_buffer +
895 cb->buffer_offset,
896 cb->buffer_size, 16);
897 } else {
898 unreachable("No constant buffer");
899 }
900 }
901
902 struct sysval_uniform {
903 union {
904 float f[4];
905 int32_t i[4];
906 uint32_t u[4];
907 uint64_t du[2];
908 };
909 };
910
911 static void
912 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
913 struct sysval_uniform *uniform)
914 {
915 struct panfrost_context *ctx = batch->ctx;
916 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
917
918 uniform->f[0] = vp->scale[0];
919 uniform->f[1] = vp->scale[1];
920 uniform->f[2] = vp->scale[2];
921 }
922
923 static void
924 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
925 struct sysval_uniform *uniform)
926 {
927 struct panfrost_context *ctx = batch->ctx;
928 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
929
930 uniform->f[0] = vp->translate[0];
931 uniform->f[1] = vp->translate[1];
932 uniform->f[2] = vp->translate[2];
933 }
934
935 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
936 enum pipe_shader_type st,
937 unsigned int sysvalid,
938 struct sysval_uniform *uniform)
939 {
940 struct panfrost_context *ctx = batch->ctx;
941 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
942 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
943 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
944 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
945
946 assert(dim);
947 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
948
949 if (dim > 1)
950 uniform->i[1] = u_minify(tex->texture->height0,
951 tex->u.tex.first_level);
952
953 if (dim > 2)
954 uniform->i[2] = u_minify(tex->texture->depth0,
955 tex->u.tex.first_level);
956
957 if (is_array)
958 uniform->i[dim] = tex->texture->array_size;
959 }
960
961 static void
962 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
963 enum pipe_shader_type st,
964 unsigned ssbo_id,
965 struct sysval_uniform *uniform)
966 {
967 struct panfrost_context *ctx = batch->ctx;
968
969 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
970 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
971
972 /* Compute address */
973 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
974
975 panfrost_batch_add_bo(batch, bo,
976 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
977 panfrost_bo_access_for_stage(st));
978
979 /* Upload address and size as sysval */
980 uniform->du[0] = bo->gpu + sb.buffer_offset;
981 uniform->u[2] = sb.buffer_size;
982 }
983
984 static void
985 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
986 enum pipe_shader_type st,
987 unsigned samp_idx,
988 struct sysval_uniform *uniform)
989 {
990 struct panfrost_context *ctx = batch->ctx;
991 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
992
993 uniform->f[0] = sampl->min_lod;
994 uniform->f[1] = sampl->max_lod;
995 uniform->f[2] = sampl->lod_bias;
996
997 /* Even without any errata, Midgard represents "no mipmapping" as
998 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
999 * panfrost_create_sampler_state which also explains our choice of
1000 * epsilon value (again to keep behaviour consistent) */
1001
1002 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1003 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1004 }
1005
1006 static void
1007 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1008 struct sysval_uniform *uniform)
1009 {
1010 struct panfrost_context *ctx = batch->ctx;
1011
1012 uniform->u[0] = ctx->compute_grid->grid[0];
1013 uniform->u[1] = ctx->compute_grid->grid[1];
1014 uniform->u[2] = ctx->compute_grid->grid[2];
1015 }
1016
1017 static void
1018 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1019 struct panfrost_shader_state *ss,
1020 enum pipe_shader_type st)
1021 {
1022 struct sysval_uniform *uniforms = (void *)buf;
1023
1024 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1025 int sysval = ss->sysval[i];
1026
1027 switch (PAN_SYSVAL_TYPE(sysval)) {
1028 case PAN_SYSVAL_VIEWPORT_SCALE:
1029 panfrost_upload_viewport_scale_sysval(batch,
1030 &uniforms[i]);
1031 break;
1032 case PAN_SYSVAL_VIEWPORT_OFFSET:
1033 panfrost_upload_viewport_offset_sysval(batch,
1034 &uniforms[i]);
1035 break;
1036 case PAN_SYSVAL_TEXTURE_SIZE:
1037 panfrost_upload_txs_sysval(batch, st,
1038 PAN_SYSVAL_ID(sysval),
1039 &uniforms[i]);
1040 break;
1041 case PAN_SYSVAL_SSBO:
1042 panfrost_upload_ssbo_sysval(batch, st,
1043 PAN_SYSVAL_ID(sysval),
1044 &uniforms[i]);
1045 break;
1046 case PAN_SYSVAL_NUM_WORK_GROUPS:
1047 panfrost_upload_num_work_groups_sysval(batch,
1048 &uniforms[i]);
1049 break;
1050 case PAN_SYSVAL_SAMPLER:
1051 panfrost_upload_sampler_sysval(batch, st,
1052 PAN_SYSVAL_ID(sysval),
1053 &uniforms[i]);
1054 break;
1055 default:
1056 assert(0);
1057 }
1058 }
1059 }
1060
1061 static const void *
1062 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1063 unsigned index)
1064 {
1065 struct pipe_constant_buffer *cb = &buf->cb[index];
1066 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1067
1068 if (rsrc)
1069 return rsrc->bo->cpu;
1070 else if (cb->user_buffer)
1071 return cb->user_buffer;
1072 else
1073 unreachable("No constant buffer");
1074 }
1075
1076 void
1077 panfrost_emit_const_buf(struct panfrost_batch *batch,
1078 enum pipe_shader_type stage,
1079 struct mali_vertex_tiler_postfix *postfix)
1080 {
1081 struct panfrost_context *ctx = batch->ctx;
1082 struct panfrost_shader_variants *all = ctx->shader[stage];
1083
1084 if (!all)
1085 return;
1086
1087 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1088
1089 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1090
1091 /* Uniforms are implicitly UBO #0 */
1092 bool has_uniforms = buf->enabled_mask & (1 << 0);
1093
1094 /* Allocate room for the sysval and the uniforms */
1095 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1096 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1097 size_t size = sys_size + uniform_size;
1098 struct panfrost_transfer transfer =
1099 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1100
1101 /* Upload sysvals requested by the shader */
1102 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1103
1104 /* Upload uniforms */
1105 if (has_uniforms && uniform_size) {
1106 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1107 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1108 }
1109
1110 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1111 * uploaded */
1112
1113 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1114 assert(ubo_count >= 1);
1115
1116 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1117 struct panfrost_transfer ubos =
1118 panfrost_pool_alloc_aligned(&batch->pool, sz,
1119 MALI_UNIFORM_BUFFER_LENGTH);
1120
1121 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1122
1123 /* Upload uniforms as a UBO */
1124
1125 if (size) {
1126 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1127 cfg.entries = DIV_ROUND_UP(size, 16);
1128 cfg.pointer = transfer.gpu;
1129 }
1130 } else {
1131 *ubo_ptr = 0;
1132 }
1133
1134 /* The rest are honest-to-goodness UBOs */
1135
1136 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1137 size_t usz = buf->cb[ubo].buffer_size;
1138 bool enabled = buf->enabled_mask & (1 << ubo);
1139 bool empty = usz == 0;
1140
1141 if (!enabled || empty) {
1142 ubo_ptr[ubo] = 0;
1143 continue;
1144 }
1145
1146 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1147 cfg.entries = DIV_ROUND_UP(usz, 16);
1148 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1149 stage, buf, ubo);
1150 }
1151 }
1152
1153 postfix->uniforms = transfer.gpu;
1154 postfix->uniform_buffers = ubos.gpu;
1155
1156 buf->dirty_mask = 0;
1157 }
1158
1159 void
1160 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1161 const struct pipe_grid_info *info,
1162 struct midgard_payload_vertex_tiler *vtp)
1163 {
1164 struct panfrost_context *ctx = batch->ctx;
1165 struct panfrost_device *dev = pan_device(ctx->base.screen);
1166 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1167 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1168 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1169 128));
1170
1171 unsigned log2_instances =
1172 util_logbase2_ceil(info->grid[0]) +
1173 util_logbase2_ceil(info->grid[1]) +
1174 util_logbase2_ceil(info->grid[2]);
1175
1176 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1177 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1178 shared_size,
1179 1);
1180
1181 struct mali_shared_memory shared = {
1182 .shared_memory = bo->gpu,
1183 .shared_workgroup_count = log2_instances,
1184 .shared_shift = util_logbase2(single_size) + 1
1185 };
1186
1187 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1188 sizeof(shared), 64);
1189 }
1190
1191 static mali_ptr
1192 panfrost_get_tex_desc(struct panfrost_batch *batch,
1193 enum pipe_shader_type st,
1194 struct panfrost_sampler_view *view)
1195 {
1196 if (!view)
1197 return (mali_ptr) 0;
1198
1199 struct pipe_sampler_view *pview = &view->base;
1200 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1201
1202 /* Add the BO to the job so it's retained until the job is done. */
1203
1204 panfrost_batch_add_bo(batch, rsrc->bo,
1205 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1206 panfrost_bo_access_for_stage(st));
1207
1208 panfrost_batch_add_bo(batch, view->bo,
1209 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1210 panfrost_bo_access_for_stage(st));
1211
1212 return view->bo->gpu;
1213 }
1214
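/* Re-create the sampler view's descriptor BO if the underlying resource's
 * BO or layout modifier changed since the descriptor was last built. */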
1215 static void
1216 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1217 struct pipe_context *pctx)
1218 {
1219 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1220 if (view->texture_bo != rsrc->bo->gpu ||
1221 view->modifier != rsrc->modifier) {
1222 panfrost_bo_unreference(view->bo);
1223 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1224 }
1225 }
1226
1227 void
1228 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1229 enum pipe_shader_type stage,
1230 struct mali_vertex_tiler_postfix *postfix)
1231 {
1232 struct panfrost_context *ctx = batch->ctx;
1233 struct panfrost_device *device = pan_device(ctx->base.screen);
1234
1235 if (!ctx->sampler_view_count[stage])
1236 return;
1237
1238 if (device->quirks & IS_BIFROST) {
1239 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1240 MALI_BIFROST_TEXTURE_LENGTH *
1241 ctx->sampler_view_count[stage],
1242 MALI_BIFROST_TEXTURE_LENGTH);
1243
1244 struct mali_bifrost_texture_packed *out =
1245 (struct mali_bifrost_texture_packed *) T.cpu;
1246
1247 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1248 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1249 struct pipe_sampler_view *pview = &view->base;
1250 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1251
1252 panfrost_update_sampler_view(view, &ctx->base);
1253 out[i] = view->bifrost_descriptor;
1254
1255 /* Add the BOs to the job so they are retained until the job is done. */
1256
1257 panfrost_batch_add_bo(batch, rsrc->bo,
1258 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1259 panfrost_bo_access_for_stage(stage));
1260
1261 panfrost_batch_add_bo(batch, view->bo,
1262 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1263 panfrost_bo_access_for_stage(stage));
1264 }
1265
1266 postfix->textures = T.gpu;
1267 } else {
1268 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1269
1270 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1271 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1272
1273 panfrost_update_sampler_view(view, &ctx->base);
1274
1275 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1276 }
1277
1278 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1279 trampolines,
1280 sizeof(uint64_t) *
1281 ctx->sampler_view_count[stage],
1282 sizeof(uint64_t));
1283 }
1284 }
1285
1286 void
1287 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1288 enum pipe_shader_type stage,
1289 struct mali_vertex_tiler_postfix *postfix)
1290 {
1291 struct panfrost_context *ctx = batch->ctx;
1292
1293 if (!ctx->sampler_count[stage])
1294 return;
1295
1296 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1297 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1298
1299 size_t sz = desc_size * ctx->sampler_count[stage];
1300 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1301 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1302
1303 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1304 out[i] = ctx->samplers[stage][i]->hw;
1305
1306 postfix->sampler_descriptor = T.gpu;
1307 }
1308
1309 void
1310 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1311 struct mali_vertex_tiler_postfix *vertex_postfix)
1312 {
1313 struct panfrost_context *ctx = batch->ctx;
1314 struct panfrost_vertex_state *so = ctx->vertex;
1315 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1316
1317 unsigned instance_shift = vertex_postfix->instance_shift;
1318 unsigned instance_odd = vertex_postfix->instance_odd;
1319
1320 /* Worst case: everything is NPOT, which is only possible if instancing
1321 * is enabled. Otherwise a single record is guaranteed */
1322 bool could_npot = instance_shift || instance_odd;
1323
1324 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1325 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1326 (could_npot ? 2 : 1),
1327 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1328
1329 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1330 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1331 MALI_ATTRIBUTE_LENGTH);
1332
1333 struct mali_attribute_buffer_packed *bufs =
1334 (struct mali_attribute_buffer_packed *) S.cpu;
1335
1336 struct mali_attribute_packed *out =
1337 (struct mali_attribute_packed *) T.cpu;
1338
1339 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1340 unsigned k = 0;
1341
1342 for (unsigned i = 0; i < so->num_elements; ++i) {
1343 /* We map buffers 1:1 with the attributes, which
1344 * means duplicating some vertex buffers (who cares? aside from
1345 * maybe some caching implications but I somehow doubt that
1346 * matters) */
1347
1348 struct pipe_vertex_element *elem = &so->pipe[i];
1349 unsigned vbi = elem->vertex_buffer_index;
1350 attrib_to_buffer[i] = k;
1351
1352 if (!(ctx->vb_mask & (1 << vbi)))
1353 continue;
1354
1355 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1356 struct panfrost_resource *rsrc;
1357
1358 rsrc = pan_resource(buf->buffer.resource);
1359 if (!rsrc)
1360 continue;
1361
1362 /* Add a dependency of the batch on the vertex buffer */
1363 panfrost_batch_add_bo(batch, rsrc->bo,
1364 PAN_BO_ACCESS_SHARED |
1365 PAN_BO_ACCESS_READ |
1366 PAN_BO_ACCESS_VERTEX_TILER);
1367
1368 /* Mask off lower bits, see offset fixup below */
1369 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1370 mali_ptr addr = raw_addr & ~63;
1371
1372 /* Since we advanced the base pointer, we shrink the buffer
1373 * size, but add the offset we subtracted */
1374 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1375 - buf->buffer_offset;
1376
1377 /* When there is a divisor, the hardware-level divisor is
1378 * the product of the instance divisor and the padded count */
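/* e.g. an API divisor of 2 with a padded count of 8 gives hw_divisor = 16,
 * which takes the power-of-two path below */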
1379 unsigned divisor = elem->instance_divisor;
1380 unsigned hw_divisor = ctx->padded_count * divisor;
1381 unsigned stride = buf->stride;
1382
1383 /* If there's a divisor(=1) but no instancing, we want every
1384 * attribute to be the same */
1385
1386 if (divisor && ctx->instance_count == 1)
1387 stride = 0;
1388
1389 if (!divisor || ctx->instance_count <= 1) {
1390 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1391 if (ctx->instance_count > 1)
1392 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1393
1394 cfg.pointer = addr;
1395 cfg.stride = stride;
1396 cfg.size = size;
1397 cfg.divisor_r = instance_shift;
1398 cfg.divisor_p = instance_odd;
1399 }
1400 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1401 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1402 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1403 cfg.pointer = addr;
1404 cfg.stride = stride;
1405 cfg.size = size;
1406 cfg.divisor_r = __builtin_ctz(hw_divisor);
1407 }
1408
1409 } else {
1410 unsigned shift = 0, extra_flags = 0;
1411
1412 unsigned magic_divisor =
1413 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1414
1415 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1416 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1417 cfg.pointer = addr;
1418 cfg.stride = stride;
1419 cfg.size = size;
1420
1421 cfg.divisor_r = shift;
1422 cfg.divisor_e = extra_flags;
1423 }
1424
1425 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1426 cfg.divisor_numerator = magic_divisor;
1427 cfg.divisor = divisor;
1428 }
1429
1430 ++k;
1431 }
1432
1433 ++k;
1434 }
1435
1436 /* Add special gl_VertexID/gl_InstanceID buffers */
1437
1438 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1439 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1440
1441 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1442 cfg.buffer_index = k++;
1443 cfg.format = so->formats[PAN_VERTEX_ID];
1444 }
1445
1446 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1447
1448 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1449 cfg.buffer_index = k++;
1450 cfg.format = so->formats[PAN_INSTANCE_ID];
1451 }
1452 }
1453
1454 /* Attribute addresses require 64-byte alignment, so let:
1455 *
1456 * base' = base & ~63 = base - (base & 63)
1457 * offset' = offset + (base & 63)
1458 *
1459 * Since base' + offset' = base + offset, these are equivalent
1460 * addressing modes and now base is 64 aligned.
1461 */
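/* For example, base = 0x1007 yields base' = 0x1000, with the remaining 7
 * bytes folded into src_offset below. */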
1462
1463 unsigned start = vertex_postfix->offset_start;
1464
1465 for (unsigned i = 0; i < so->num_elements; ++i) {
1466 unsigned vbi = so->pipe[i].vertex_buffer_index;
1467 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1468
1469 /* Adjust by the masked off bits of the offset. Make sure we
1470 * read src_offset from so->hw (which is not GPU visible)
1471 * rather than target (which is) due to caching effects */
1472
1473 unsigned src_offset = so->pipe[i].src_offset;
1474
1475 /* BOs aligned to 4k so guaranteed aligned to 64 */
1476 src_offset += (buf->buffer_offset & 63);
1477
1478 /* Also, somewhat obscurely, per-instance data needs to be
1479 * offset in response to a delayed start in an indexed draw */
1480
1481 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1482 src_offset -= buf->stride * start;
1483
1484 pan_pack(out + i, ATTRIBUTE, cfg) {
1485 cfg.buffer_index = attrib_to_buffer[i];
1486 cfg.format = so->formats[i];
1487 cfg.offset = src_offset;
1488 }
1489 }
1490
1491 vertex_postfix->attributes = S.gpu;
1492 vertex_postfix->attribute_meta = T.gpu;
1493 }
1494
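/* Allocate a transient buffer for a varying of the given stride/count from
 * the batch's invisible pool, fill in the attribute buffer record and
 * return the GPU pointer */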
1495 static mali_ptr
1496 panfrost_emit_varyings(struct panfrost_batch *batch,
1497 struct mali_attribute_buffer_packed *slot,
1498 unsigned stride, unsigned count)
1499 {
1500 unsigned size = stride * count;
1501 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1502
1503 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1504 cfg.stride = stride;
1505 cfg.size = size;
1506 cfg.pointer = ptr;
1507 }
1508
1509 return ptr;
1510 }
1511
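/* Streamout buffer pointers are aligned down to 64 bytes (see
 * panfrost_emit_streamout), so this returns the sub-64-byte residue that
 * must be added back into the attribute record's offset */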
1512 static unsigned
1513 panfrost_streamout_offset(unsigned stride, unsigned offset,
1514 struct pipe_stream_output_target *target)
1515 {
1516 return (target->buffer_offset + (offset * stride * 4)) & 63;
1517 }
1518
1519 static void
1520 panfrost_emit_streamout(struct panfrost_batch *batch,
1521 struct mali_attribute_buffer_packed *slot,
1522 unsigned stride_words, unsigned offset, unsigned count,
1523 struct pipe_stream_output_target *target)
1524 {
1525 unsigned stride = stride_words * 4;
1526 unsigned max_size = target->buffer_size;
1527 unsigned expected_size = stride * count;
1528
1529 /* Grab the BO and bind it to the batch */
1530 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1531
1532 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1533 * the perspective of the TILER and FRAGMENT.
1534 */
1535 panfrost_batch_add_bo(batch, bo,
1536 PAN_BO_ACCESS_SHARED |
1537 PAN_BO_ACCESS_RW |
1538 PAN_BO_ACCESS_VERTEX_TILER |
1539 PAN_BO_ACCESS_FRAGMENT);
1540
1541 /* We will have an offset applied to get alignment */
1542 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1543
1544 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1545 cfg.pointer = (addr & ~63);
1546 cfg.stride = stride;
1547 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1548 }
1549 }
1550
1551 static bool
1552 has_point_coord(unsigned mask, gl_varying_slot loc)
1553 {
1554 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1555 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1556 else if (loc == VARYING_SLOT_PNTC)
1557 return (mask & (1 << 8));
1558 else
1559 return false;
1560 }
1561
1562 /* Helpers for manipulating stream out information so we can pack varyings
1563 * accordingly. Compute the src_offset for a given captured varying */
1564
1565 static struct pipe_stream_output *
1566 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1567 {
1568 for (unsigned i = 0; i < info->num_outputs; ++i) {
1569 if (info->output[i].register_index == loc)
1570 return &info->output[i];
1571 }
1572
1573 unreachable("Varying not captured");
1574 }
1575
1576 static unsigned
1577 pan_varying_size(enum mali_format fmt)
1578 {
1579 unsigned type = MALI_EXTRACT_TYPE(fmt);
1580 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1581 unsigned bits = MALI_EXTRACT_BITS(fmt);
1582 unsigned bpc = 0;
1583
1584 if (bits == MALI_CHANNEL_FLOAT) {
1585 /* No doubles */
1586 bool fp16 = (type == MALI_FORMAT_SINT);
1587 assert(fp16 || (type == MALI_FORMAT_UNORM));
1588
1589 bpc = fp16 ? 2 : 4;
1590 } else {
1591 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1592
1593 /* See the enums */
1594 bits = 1 << bits;
1595 assert(bits >= 8);
1596 bpc = bits / 8;
1597 }
1598
1599 return bpc * chan;
1600 }
1601
1602 /* Indices for named (non-XFB) varyings that are present. These are packed
1603 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1604 * PAN_VARY_*). This has the nice property that you can look up the buffer index
1605 * of a given special field given a shift S by:
1606 *
1607 * idx = popcount(P & ((1 << S) - 1))
1608 *
1609 * That is... look at all of the varyings that come earlier and count them; that
1610 * count is the index. Likewise, the total number of special
1611 * buffers required is simply popcount(P)
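 *
 * For example, with P = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION) |
 * (1 << PAN_VARY_PSIZ), looking up PAN_VARY_PSIZ (S = 2) gives
 * popcount(P & 0b011) = 2, and popcount(P) = 3 special buffers are needed.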
1612 */
1613
1614 enum pan_special_varying {
1615 PAN_VARY_GENERAL = 0,
1616 PAN_VARY_POSITION = 1,
1617 PAN_VARY_PSIZ = 2,
1618 PAN_VARY_PNTCOORD = 3,
1619 PAN_VARY_FACE = 4,
1620 PAN_VARY_FRAGCOORD = 5,
1621
1622 /* Keep last */
1623 PAN_VARY_MAX,
1624 };
1625
1626 /* Given a varying, figure out which index it corresponds to */
1627
1628 static inline unsigned
1629 pan_varying_index(unsigned present, enum pan_special_varying v)
1630 {
1631 unsigned mask = (1 << v) - 1;
1632 return util_bitcount(present & mask);
1633 }
1634
1635 /* Get the base offset for XFB buffers, which by convention come after
1636 * everything else. Wrapper function for semantic reasons; by construction this
1637 * is just popcount. */
1638
1639 static inline unsigned
1640 pan_xfb_base(unsigned present)
1641 {
1642 return util_bitcount(present);
1643 }
1644
1645 /* Computes the present mask for varyings so we can start emitting varying records */
1646
1647 static inline unsigned
1648 pan_varying_present(
1649 struct panfrost_shader_state *vs,
1650 struct panfrost_shader_state *fs,
1651 unsigned quirks)
1652 {
1653 /* At the moment we always emit general and position buffers. Not
1654 * strictly necessary but usually harmless */
1655
1656 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1657
1658 /* Enable special buffers by the shader info */
1659
1660 if (vs->writes_point_size)
1661 present |= (1 << PAN_VARY_PSIZ);
1662
1663 if (fs->reads_point_coord)
1664 present |= (1 << PAN_VARY_PNTCOORD);
1665
1666 if (fs->reads_face)
1667 present |= (1 << PAN_VARY_FACE);
1668
1669 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1670 present |= (1 << PAN_VARY_FRAGCOORD);
1671
1672 /* Also, if we have a point sprite, we need a point coord buffer */
1673
1674 for (unsigned i = 0; i < fs->varying_count; i++) {
1675 gl_varying_slot loc = fs->varyings_loc[i];
1676
1677 if (has_point_coord(fs->point_sprite_mask, loc))
1678 present |= (1 << PAN_VARY_PNTCOORD);
1679 }
1680
1681 return present;
1682 }
1683
1684 /* Emitters for varying records */
1685
1686 static void
1687 pan_emit_vary(struct mali_attribute_packed *out,
1688 unsigned present, enum pan_special_varying buf,
1689 unsigned quirks, enum mali_format format,
1690 unsigned offset)
1691 {
1692 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1693 unsigned swizzle = quirks & HAS_SWIZZLES ?
1694 panfrost_get_default_swizzle(nr_channels) :
1695 panfrost_bifrost_swizzle(nr_channels);
1696
1697 pan_pack(out, ATTRIBUTE, cfg) {
1698 cfg.buffer_index = pan_varying_index(present, buf);
1699 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1700 cfg.format = (format << 12) | swizzle;
1701 cfg.offset = offset;
1702 }
1703 }
1704
1705 /* General varying that is unused */
1706
1707 static void
1708 pan_emit_vary_only(struct mali_attribute_packed *out,
1709 unsigned present, unsigned quirks)
1710 {
1711 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1712 }
1713
1714 /* Special records */
1715
1716 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1717 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1718 [PAN_VARY_PSIZ] = MALI_R16F,
1719 [PAN_VARY_PNTCOORD] = MALI_R16F,
1720 [PAN_VARY_FACE] = MALI_R32I,
1721 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1722 };
1723
1724 static void
1725 pan_emit_vary_special(struct mali_attribute_packed *out,
1726 unsigned present, enum pan_special_varying buf,
1727 unsigned quirks)
1728 {
1729 assert(buf < PAN_VARY_MAX);
1730 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1731 }
1732
1733 static enum mali_format
1734 pan_xfb_format(enum mali_format format, unsigned nr)
1735 {
1736 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1737 return MALI_R32F | MALI_NR_CHANNELS(nr);
1738 else
1739 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1740 }
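
/* XFB always captures 32 bits per component, so e.g. a two-component float
 * varying is recorded here as a two-channel 32-bit float format before being
 * packed into the capture record. */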
1741
1742 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1743 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1744 * value. */
1745
1746 static void
1747 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1748 unsigned present,
1749 unsigned max_xfb,
1750 unsigned *streamout_offsets,
1751 unsigned quirks,
1752 enum mali_format format,
1753 struct pipe_stream_output o)
1754 {
1755 unsigned swizzle = quirks & HAS_SWIZZLES ?
1756 panfrost_get_default_swizzle(o.num_components) :
1757 panfrost_bifrost_swizzle(o.num_components);
1758
1759 pan_pack(out, ATTRIBUTE, cfg) {
1760 /* XFB buffers come after everything else */
1761 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1762 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1763
1764 /* Override number of channels and precision to highp */
1765 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1766
1767 /* Sum the destination offset (given in dwords) and the buffer's streamout offset */
1768 cfg.offset = (o.dst_offset * 4) /* dwords */
1769 + streamout_offsets[o.output_buffer];
1770 }
1771 }
1772
1773 /* Determine if we should capture a varying for XFB. This requires actually
1774 * having a buffer for it. If we don't capture it, we'll fall back to a general
1775 * varying path (linked or unlinked, possibly discarding the write) */
1776
1777 static bool
1778 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1779 unsigned loc, unsigned max_xfb)
1780 {
1781 if (!(xfb->so_mask & (1ll << loc)))
1782 return false;
1783
1784 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1785 return o->output_buffer < max_xfb;
1786 }
1787
1788 static void
1789 pan_emit_general_varying(struct mali_attribute_packed *out,
1790 struct panfrost_shader_state *other,
1791 struct panfrost_shader_state *xfb,
1792 gl_varying_slot loc,
1793 enum mali_format format,
1794 unsigned present,
1795 unsigned quirks,
1796 unsigned *gen_offsets,
1797 enum mali_format *gen_formats,
1798 unsigned *gen_stride,
1799 unsigned idx,
1800 bool should_alloc)
1801 {
1802 /* Check if we're linked */
1803 signed other_idx = -1;
1804
1805 for (unsigned j = 0; j < other->varying_count; ++j) {
1806 if (other->varyings_loc[j] == loc) {
1807 other_idx = j;
1808 break;
1809 }
1810 }
1811
1812 if (other_idx < 0) {
1813 pan_emit_vary_only(out, present, quirks);
1814 return;
1815 }
1816
1817 unsigned offset = gen_offsets[other_idx];
1818
1819 if (should_alloc) {
1820 /* We're linked, so allocate space via a watermark allocation */
1821 enum mali_format alt = other->varyings[other_idx];
1822
1823 /* Do interpolation at minimum precision */
1824 unsigned size_main = pan_varying_size(format);
1825 unsigned size_alt = pan_varying_size(alt);
1826 unsigned size = MIN2(size_main, size_alt);
1827
1828 /* If a varying is marked for XFB but not actually captured, we
1829 * should match the format to the format that would otherwise
1830 * be used for XFB, since dEQP checks for invariance here. It's
1831 * unclear if this is required by the spec. */
1832
1833 if (xfb->so_mask & (1ull << loc)) {
1834 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1835 format = pan_xfb_format(format, o->num_components);
1836 size = pan_varying_size(format);
1837 } else if (size == size_alt) {
1838 format = alt;
1839 }
1840
1841 gen_offsets[idx] = *gen_stride;
1842 gen_formats[other_idx] = format;
1843 offset = *gen_stride;
1844 *gen_stride += size;
1845 }
1846
1847 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1848 }
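
/* The watermark allocation above acts as a running cursor: each newly linked
 * varying is placed at the current gen_stride and the stride then advances by
 * that varying's size. For example, linking an 8-byte vec2 followed by a
 * 16-byte vec4 gives offsets 0 and 8 and a final gen_stride of 24, which later
 * sizes the general varying buffer. */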
1849
1850 /* Higher-level wrapper around all of the above, classifying a varying into one
1851 * of the categories above and dispatching to the matching emitter */
1852
1853 static void
1854 panfrost_emit_varying(
1855 struct mali_attribute_packed *out,
1856 struct panfrost_shader_state *stage,
1857 struct panfrost_shader_state *other,
1858 struct panfrost_shader_state *xfb,
1859 unsigned present,
1860 unsigned max_xfb,
1861 unsigned *streamout_offsets,
1862 unsigned quirks,
1863 unsigned *gen_offsets,
1864 enum mali_format *gen_formats,
1865 unsigned *gen_stride,
1866 unsigned idx,
1867 bool should_alloc,
1868 bool is_fragment)
1869 {
1870 gl_varying_slot loc = stage->varyings_loc[idx];
1871 enum mali_format format = stage->varyings[idx];
1872
1873 /* Override format to match linkage */
1874 if (!should_alloc && gen_formats[idx])
1875 format = gen_formats[idx];
1876
1877 if (has_point_coord(stage->point_sprite_mask, loc)) {
1878 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1879 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1880 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1881 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1882 } else if (loc == VARYING_SLOT_POS) {
1883 if (is_fragment)
1884 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1885 else
1886 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1887 } else if (loc == VARYING_SLOT_PSIZ) {
1888 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1889 } else if (loc == VARYING_SLOT_PNTC) {
1890 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1891 } else if (loc == VARYING_SLOT_FACE) {
1892 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1893 } else {
1894 pan_emit_general_varying(out, other, xfb, loc, format, present,
1895 quirks, gen_offsets, gen_formats, gen_stride,
1896 idx, should_alloc);
1897 }
1898 }
1899
1900 static void
1901 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1902 unsigned present,
1903 enum pan_special_varying v,
1904 unsigned special)
1905 {
1906 if (present & (1 << v)) {
1907 unsigned idx = pan_varying_index(present, v);
1908
1909 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1910 cfg.special = special;
1911 cfg.type = 0;
1912 }
1913 }
1914 }
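
/* Note these records carry no pointer or size: they only select a special
 * input (point coord, front facing or frag coord) by name, and a record is
 * written only when the corresponding buffer is actually present. */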
1915
1916 void
1917 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1918 unsigned vertex_count,
1919 struct mali_vertex_tiler_postfix *vertex_postfix,
1920 struct mali_vertex_tiler_postfix *tiler_postfix,
1921 union midgard_primitive_size *primitive_size)
1922 {
1923 /* Load the shaders */
1924 struct panfrost_context *ctx = batch->ctx;
1925 struct panfrost_device *dev = pan_device(ctx->base.screen);
1926 struct panfrost_shader_state *vs, *fs;
1927 size_t vs_size, fs_size;
1928
1929 /* Allocate the varying descriptor */
1930
1931 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1932 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1933 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1934 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1935
1936 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1937 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1938
1939 struct pipe_stream_output_info *so = &vs->stream_output;
1940 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1941
1942 /* Check if this varying is linked by us. This is the case for
1943 * general-purpose, non-captured varyings. If it is, link it. If it's
1944 * not, use the provided stream out information to determine the
1945 * offset, since it was already linked for us. */
1946
1947 unsigned gen_offsets[32];
1948 enum mali_format gen_formats[32];
1949 memset(gen_offsets, 0, sizeof(gen_offsets));
1950 memset(gen_formats, 0, sizeof(gen_formats));
1951
1952 unsigned gen_stride = 0;
1953 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1954 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1955
1956 unsigned streamout_offsets[32];
1957
1958 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1959 streamout_offsets[i] = panfrost_streamout_offset(
1960 so->stride[i],
1961 ctx->streamout.offsets[i],
1962 ctx->streamout.targets[i]);
1963 }
1964
1965 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1966 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1967
1968 for (unsigned i = 0; i < vs->varying_count; i++) {
1969 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1970 ctx->streamout.num_targets, streamout_offsets,
1971 dev->quirks,
1972 gen_offsets, gen_formats, &gen_stride, i, true, false);
1973 }
1974
1975 for (unsigned i = 0; i < fs->varying_count; i++) {
1976 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1977 ctx->streamout.num_targets, streamout_offsets,
1978 dev->quirks,
1979 gen_offsets, gen_formats, &gen_stride, i, false, true);
1980 }
1981
1982 unsigned xfb_base = pan_xfb_base(present);
1983 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1984 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1985 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1986 struct mali_attribute_buffer_packed *varyings =
1987 (struct mali_attribute_buffer_packed *) T.cpu;
1988
1989 /* Emit the stream out buffers */
1990
1991 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1992 ctx->vertex_count);
1993
1994 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1995 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1996 so->stride[i],
1997 ctx->streamout.offsets[i],
1998 out_count,
1999 ctx->streamout.targets[i]);
2000 }
2001
2002 panfrost_emit_varyings(batch,
2003 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2004 gen_stride, vertex_count);
2005
2006 /* fp32 vec4 gl_Position */
2007 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2008 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2009 sizeof(float) * 4, vertex_count);
2010
2011 if (present & (1 << PAN_VARY_PSIZ)) {
2012 primitive_size->pointer = panfrost_emit_varyings(batch,
2013 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2014 2, vertex_count);
2015 }
2016
2017 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2018 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2019 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2020
2021 vertex_postfix->varyings = T.gpu;
2022 tiler_postfix->varyings = T.gpu;
2023
2024 vertex_postfix->varying_meta = trans.gpu;
2025 tiler_postfix->varying_meta = trans.gpu + vs_size;
2026 }
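
/* Putting it together, the attribute buffer array built above is laid out as:
 * the general varying buffer, then gl_Position, then whichever special buffers
 * are present (in pan_special_varying order), then one streamout buffer per
 * XFB target; record indices are recovered with pan_varying_index() and
 * pan_xfb_base() exactly as in the earlier examples. */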
2027
2028 void
2029 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2030 struct mali_vertex_tiler_prefix *vertex_prefix,
2031 struct mali_vertex_tiler_postfix *vertex_postfix,
2032 struct mali_vertex_tiler_prefix *tiler_prefix,
2033 struct mali_vertex_tiler_postfix *tiler_postfix,
2034 union midgard_primitive_size *primitive_size)
2035 {
2036 struct panfrost_context *ctx = batch->ctx;
2037 struct panfrost_device *device = pan_device(ctx->base.screen);
2038 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2039 struct bifrost_payload_vertex bifrost_vertex = {0,};
2040 struct bifrost_payload_tiler bifrost_tiler = {0,};
2041 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2042 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2043 void *vp, *tp;
2044 size_t vp_size, tp_size;
2045
2046 if (device->quirks & IS_BIFROST) {
2047 bifrost_vertex.prefix = *vertex_prefix;
2048 bifrost_vertex.postfix = *vertex_postfix;
2049 vp = &bifrost_vertex;
2050 vp_size = sizeof(bifrost_vertex);
2051
2052 bifrost_tiler.prefix = *tiler_prefix;
2053 bifrost_tiler.tiler.primitive_size = *primitive_size;
2054 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2055 bifrost_tiler.postfix = *tiler_postfix;
2056 tp = &bifrost_tiler;
2057 tp_size = sizeof(bifrost_tiler);
2058 } else {
2059 midgard_vertex.prefix = *vertex_prefix;
2060 midgard_vertex.postfix = *vertex_postfix;
2061 vp = &midgard_vertex;
2062 vp_size = sizeof(midgard_vertex);
2063
2064 midgard_tiler.prefix = *tiler_prefix;
2065 midgard_tiler.postfix = *tiler_postfix;
2066 midgard_tiler.primitive_size = *primitive_size;
2067 tp = &midgard_tiler;
2068 tp_size = sizeof(midgard_tiler);
2069 }
2070
2071 if (wallpapering) {
2072 /* Inject in reverse order, with "predicted" job indices.
2073 * THIS IS A HACK XXX */
2074 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2075 batch->scoreboard.job_index + 2, tp, tp_size, true);
2076 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2077 vp, vp_size, true);
2078 return;
2079 }
2080
2081 /* If rasterizer discard is enabled, only submit the vertex job */
2082
2083 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2084 vp, vp_size, false);
2085
2086 if (ctx->rasterizer->base.rasterizer_discard)
2087 return;
2088
2089 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2090 false);
2091 }
2092
2093 /* TODO: stop hardcoding this */
2094 mali_ptr
2095 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2096 {
2097 uint16_t locations[] = {
2098 128, 128,
2099 0, 256,
2100 0, 256,
2101 0, 256,
2102 0, 256,
2103 0, 256,
2104 0, 256,
2105 0, 256,
2106 0, 256,
2107 0, 256,
2108 0, 256,
2109 0, 256,
2110 0, 256,
2111 0, 256,
2112 0, 256,
2113 0, 256,
2114 0, 256,
2115 0, 256,
2116 0, 256,
2117 0, 256,
2118 0, 256,
2119 0, 256,
2120 0, 256,
2121 0, 256,
2122 0, 256,
2123 0, 256,
2124 0, 256,
2125 0, 256,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 0, 256,
2130 128, 128,
2131 0, 0,
2132 0, 0,
2133 0, 0,
2134 0, 0,
2135 0, 0,
2136 0, 0,
2137 0, 0,
2138 0, 0,
2139 0, 0,
2140 0, 0,
2141 0, 0,
2142 0, 0,
2143 0, 0,
2144 0, 0,
2145 0, 0,
2146 };
2147
2148 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2149 }