panfrost: Pack vertex properties when compiling
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
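/* If the shader does not write gl_PointSize, a constant size from the
 * rasterizer CSO applies instead: point_size for point draws, line_width
 * otherwise */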
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), though it could last longer. Also get
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Shrink the draw to the index range actually referenced; offset_start and the bias correction rebase everything onto min_index */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
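/* Split the padded count into a power-of-two shift and an odd factor,
 * i.e. padded_count == (2 * k + 1) << shift, which is how the
 * instance_shift/instance_odd fields are expressed */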
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static void
310 panfrost_emit_compute_shader(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 memcpy(&meta->shader, &ss->shader, sizeof(ss->shader));
319 memcpy(&meta->midgard_props, &ss->properties, sizeof(ss->properties));
320
321 if (dev->quirks & IS_BIFROST)
322 memcpy(&meta->bifrost_preload, &ss->preload, sizeof(ss->preload));
323 }
324
325 static unsigned
326 translate_tex_wrap(enum pipe_tex_wrap w)
327 {
328 switch (w) {
329 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
330 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
331 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
332 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
333 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
334 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
335 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
336 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
337 default: unreachable("Invalid wrap");
338 }
339 }
340
341 /* The hardware compares in the wrong order, so we have to flip before
342 * encoding. Yes, really. */
343
344 static enum mali_func
345 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
346 {
347 if (!cso->compare_mode)
348 return MALI_FUNC_NEVER;
349
350 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
351 return panfrost_flip_compare_func(f);
352 }
353
354 static enum mali_mipmap_mode
355 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
356 {
357 switch (f) {
358 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
359 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
360 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
361 default: unreachable("Invalid");
362 }
363 }
364
365 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
366 struct mali_midgard_sampler_packed *hw)
367 {
368 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
369 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
370 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
371 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
372 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
373 cfg.normalized_coordinates = cso->normalized_coords;
374
375 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
376
377 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
378
379 /* If necessary, we disable mipmapping in the sampler descriptor by
380 * clamping the LOD as tight as possible (from 0 to epsilon,
381 * essentially -- remember these are fixed point numbers, so
382 * epsilon=1/256) */
383
384 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
385 cfg.minimum_lod + 1 :
386 FIXED_16(cso->max_lod, false);
387
388 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
389 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
390 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
391
392 cfg.compare_function = panfrost_sampler_compare_func(cso);
393 cfg.seamless_cube_map = cso->seamless_cube_map;
394
395 cfg.border_color_r = cso->border_color.f[0];
396 cfg.border_color_g = cso->border_color.f[1];
397 cfg.border_color_b = cso->border_color.f[2];
398 cfg.border_color_a = cso->border_color.f[3];
399 }
400 }
401
402 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
403 struct mali_bifrost_sampler_packed *hw)
404 {
405 pan_pack(hw, BIFROST_SAMPLER, cfg) {
406 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
407 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
408 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
409 cfg.normalized_coordinates = cso->normalized_coords;
410
411 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
412 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
413 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
414
415 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
416 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
417 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
418
419 cfg.compare_function = panfrost_sampler_compare_func(cso);
420 cfg.seamless_cube_map = cso->seamless_cube_map;
421 }
422 }
423
424 static bool
425 panfrost_fs_required(
426 struct panfrost_shader_state *fs,
427 struct panfrost_blend_final *blend,
428 unsigned rt_count)
429 {
430 /* If we generally have side effects */
431 if (fs->fs_sidefx)
432 return true;
433
434 /* If colour is written we need to execute */
435 for (unsigned i = 0; i < rt_count; ++i) {
436 if (!blend[i].no_colour)
437 return true;
438 }
439
440 /* If depth is written and not implied we need to execute.
441 * TODO: Predicate on Z/S writes being enabled */
442 return (fs->writes_depth || fs->writes_stencil);
443 }
444
445 static void
446 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
447 struct panfrost_blend_final *blend)
448 {
449 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
450 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
451 unsigned rt_count = batch->key.nr_cbufs;
452
453 struct bifrost_blend_rt *brts = rts;
454 struct midgard_blend_rt *mrts = rts;
455
456 /* Disable blending for depth-only on Bifrost */
457
458 if (rt_count == 0 && dev->quirks & IS_BIFROST)
459 brts[0].unk2 = 0x3;
460
461 for (unsigned i = 0; i < rt_count; ++i) {
462 unsigned flags = 0;
463
464 pan_pack(&flags, BLEND_FLAGS, cfg) {
465 if (blend[i].no_colour) {
466 cfg.enable = false;
467 break;
468 }
469
470 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
471
472 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
473 cfg.load_destination = blend[i].load_dest;
474 cfg.dither_disable = !batch->ctx->blend->base.dither;
475
476 if (!(dev->quirks & IS_BIFROST))
477 cfg.midgard_blend_shader = blend[i].is_shader;
478 }
479
480 if (dev->quirks & IS_BIFROST) {
481 brts[i].flags = flags;
482
483 if (blend[i].is_shader) {
484 /* The blend shader's address needs to share
485 * the same top 32 bits as the fragment shader's.
486 * TODO: Ensure that's always the case.
487 */
488 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
489 (fs->bo->gpu & (0xffffffffull << 32)));
490 brts[i].shader = blend[i].shader.gpu;
491 brts[i].unk2 = 0x0;
492 } else {
493 enum pipe_format format = batch->key.cbufs[i]->format;
494 const struct util_format_description *format_desc;
495 format_desc = util_format_description(format);
496
497 brts[i].equation = blend[i].equation.equation;
498
499 /* TODO: this is a bit more complicated */
500 brts[i].constant = blend[i].equation.constant;
501
502 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
503
504 /* 0x19 disables blending and forces REPLACE
505 * mode (equivalent to rgb_mode = alpha_mode =
506 * x122, colour mask = 0xF). 0x1a allows
507 * blending. */
508 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
509
510 brts[i].shader_type = fs->blend_types[i];
511 }
512 } else {
513 memcpy(&mrts[i].flags, &flags, sizeof(flags));
514
515 if (blend[i].is_shader) {
516 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
517 } else {
518 mrts[i].blend.equation = blend[i].equation.equation;
519 mrts[i].blend.constant = blend[i].equation.constant;
520 }
521 }
522 }
523 }
524
525 static struct mali_shader_packed
526 panfrost_pack_shaderless(bool midgard)
527 {
528 struct mali_shader_packed pack;
529
530 pan_pack(&pack, SHADER, cfg) {
531 cfg.shader = midgard ? 0x1 : 0x0;
532 }
533
534 return pack;
535 }
536
537 static void
538 panfrost_emit_frag_shader(struct panfrost_context *ctx,
539 struct mali_shader_meta *fragmeta,
540 struct panfrost_blend_final *blend)
541 {
542 const struct panfrost_device *dev = pan_device(ctx->base.screen);
543 struct panfrost_shader_state *fs;
544
545 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
546
547 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
548 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
549 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
550
551 memset(fragmeta, 0, sizeof(*fragmeta));
552 memcpy(&fragmeta->shader, &fs->shader, sizeof(fs->shader));
553
554 if (dev->quirks & IS_BIFROST) {
555 struct mali_bifrost_properties_packed prop;
556 struct mali_preload_fragment_packed preload;
557
558 bool no_blend = true;
559
560 for (unsigned i = 0; i < rt_count; ++i)
561 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
562
563 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
564 cfg.unknown = 0x950020; /* XXX */
565 cfg.uniform_buffer_count = fs->ubo_count;
566 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
567 }
568
569 pan_pack(&preload, PRELOAD_FRAGMENT, cfg) {
570 cfg.uniform_count = fs->uniform_count;
571 cfg.fragment_position = fs->reads_frag_coord;
572 }
573
574 memcpy(&fragmeta->bifrost_props, &prop, sizeof(prop));
575 memcpy(&fragmeta->bifrost_preload, &preload, sizeof(preload));
576 } else {
577 struct mali_midgard_properties_packed prop;
578
579 /* Reasons to disable early-Z from a shader perspective */
580 bool late_z = fs->can_discard || fs->writes_global ||
581 fs->writes_depth || fs->writes_stencil;
582
583 /* Reasons to disable early-Z from a CSO perspective */
584 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
585
586 /* If either depth or stencil is enabled, discard matters */
587 bool zs_enabled =
588 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
589 zsa->base.stencil[0].enabled;
590
591 bool has_blend_shader = false;
592
593 for (unsigned c = 0; c < rt_count; ++c)
594 has_blend_shader |= blend[c].is_shader;
595
596 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
597 cfg.uniform_buffer_count = fs->ubo_count;
598 cfg.uniform_count = fs->uniform_count;
599 cfg.work_register_count = fs->work_reg_count;
600 cfg.writes_globals = fs->writes_global;
601 cfg.suppress_inf_nan = true; /* XXX */
602
603 /* TODO: Reduce this limit? */
604 if (has_blend_shader)
605 cfg.work_register_count = MAX2(cfg.work_register_count, 8);
606
607 cfg.stencil_from_shader = fs->writes_stencil;
608 cfg.helper_invocation_enable = fs->helper_invocations;
609 cfg.depth_source = fs->writes_depth ?
610 MALI_DEPTH_SOURCE_SHADER :
611 MALI_DEPTH_SOURCE_FIXED_FUNCTION;
612
613 /* Depend on other state */
614 cfg.early_z_enable = !(late_z || alpha_to_coverage);
615 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
616 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
617 }
618
619 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
620 }
621
622 bool msaa = rast->multisample;
623 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
624
625 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
626 fragmeta->unknown2_4 = 0x4e0;
627
628 /* TODO: Sample size */
629 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
630 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
631
632 /* EXT_shader_framebuffer_fetch requires the shader to be run
633 * per-sample when outputs are read. */
634 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
635 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
636
637 fragmeta->depth_units = rast->offset_units * 2.0f;
638 fragmeta->depth_factor = rast->offset_scale;
639
640 /* XXX: Which bit is which? Does this maybe allow offsetting non-triangles? */
641
642 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
643 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
644
645 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
646 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
647
648 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
649 zsa->base.stencil[0].enabled);
650
651 fragmeta->stencil_mask_front = zsa->stencil_mask_front;
652 fragmeta->stencil_mask_back = zsa->stencil_mask_back;
653
654 /* Bottom bits for stencil ref, exactly one word */
655 fragmeta->stencil_front.opaque[0] = zsa->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
656
657 /* If back-stencil is not enabled, use the front values */
658
659 if (zsa->base.stencil[1].enabled)
660 fragmeta->stencil_back.opaque[0] = zsa->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
661 else
662 fragmeta->stencil_back = fragmeta->stencil_front;
663
664 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
665 zsa->base.depth.writemask);
666
667 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
668 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
669 zsa->base.depth.enabled ? zsa->base.depth.func : PIPE_FUNC_ALWAYS));
670
671 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
672 ctx->blend->base.alpha_to_coverage);
673
674 /* Disable shader execution if we can */
675 if (!panfrost_fs_required(fs, blend, rt_count)) {
676 struct mali_shader_packed shader =
677 panfrost_pack_shaderless(!(dev->quirks & IS_BIFROST));
678
679 memcpy(&fragmeta->shader, &shader, sizeof(shader));
680
681 struct mali_midgard_properties_packed prop;
682
683 if (dev->quirks & IS_BIFROST) {
684 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
685 cfg.unknown = 0x950020; /* XXX */
686 cfg.early_z_enable = true;
687 }
688 } else {
689 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
690 cfg.work_register_count = 1;
691 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
692 cfg.early_z_enable = true;
693 }
694 }
695
696 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
697 }
698
699 if (dev->quirks & MIDGARD_SFBD) {
700 /* On platforms with only a single render target (SFBD), the blend
701 * information lives inside the shader meta itself. We additionally
702 * need to signal CAN_DISCARD for nontrivial blend modes (so
703 * we're able to read back the destination buffer) */
704
705 if (blend[0].no_colour)
706 return;
707
708 fragmeta->unknown2_4 |= MALI_SFBD_ENABLE;
709
710 SET_BIT(fragmeta->unknown2_4, MALI_SFBD_SRGB,
711 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format));
712
713 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
714 blend[0].is_shader);
715
716 if (blend[0].is_shader) {
717 fragmeta->blend.shader = blend[0].shader.gpu |
718 blend[0].shader.first_tag;
719 } else {
720 fragmeta->blend.equation = blend[0].equation.equation;
721 fragmeta->blend.constant = blend[0].equation.constant;
722 }
723
724 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
725 blend[0].load_dest);
726
727 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER, !ctx->blend->base.dither);
728 } else if (!(dev->quirks & IS_BIFROST)) {
729 /* Bug where MRT-capable hw apparently reads the last blend
730 * shader from here instead of the usual location? */
731
732 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
733 if (!blend[rt].is_shader)
734 continue;
735
736 fragmeta->blend.shader = blend[rt].shader.gpu |
737 blend[rt].shader.first_tag;
738 break;
739 }
740 }
741 }
742
743 void
744 panfrost_emit_shader_meta(struct panfrost_batch *batch,
745 enum pipe_shader_type st,
746 struct mali_vertex_tiler_postfix *postfix)
747 {
748 struct panfrost_context *ctx = batch->ctx;
749 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
750
751 if (!ss) {
752 postfix->shader = 0;
753 return;
754 }
755
756 struct mali_shader_meta meta;
757
758 /* Add the shader BO to the batch. */
759 panfrost_batch_add_bo(batch, ss->bo,
760 PAN_BO_ACCESS_PRIVATE |
761 PAN_BO_ACCESS_READ |
762 panfrost_bo_access_for_stage(st));
763
764 mali_ptr shader_ptr;
765
766 if (st == PIPE_SHADER_FRAGMENT) {
767 struct panfrost_device *dev = pan_device(ctx->base.screen);
768 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
769 size_t desc_size = sizeof(meta);
770 void *rts = NULL;
771 struct panfrost_transfer xfer;
772 unsigned rt_size;
773
774 if (dev->quirks & MIDGARD_SFBD)
775 rt_size = 0;
776 else if (dev->quirks & IS_BIFROST)
777 rt_size = sizeof(struct bifrost_blend_rt);
778 else
779 rt_size = sizeof(struct midgard_blend_rt);
780
781 desc_size += rt_size * rt_count;
782
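/* The per-render-target blend records are laid out immediately after the
 * shader descriptor in the same allocation (see the memcpy below) */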
783 if (rt_size)
784 rts = rzalloc_size(ctx, rt_size * rt_count);
785
786 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
787
788 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
789 blend[c] = panfrost_get_blend_for_context(ctx, c);
790
791 panfrost_emit_frag_shader(ctx, &meta, blend);
792
793 if (!(dev->quirks & MIDGARD_SFBD))
794 panfrost_emit_blend(batch, rts, blend);
795 else
796 batch->draws |= PIPE_CLEAR_COLOR0;
797
798 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
799
800 memcpy(xfer.cpu, &meta, sizeof(meta));
801 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
802
803 if (rt_size)
804 ralloc_free(rts);
805
806 shader_ptr = xfer.gpu;
807 } else {
808 panfrost_emit_compute_shader(ctx, st, &meta);
809
810 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
811 sizeof(meta));
812 }
813
814 postfix->shader = shader_ptr;
815 }
816
817 void
818 panfrost_emit_viewport(struct panfrost_batch *batch,
819 struct mali_vertex_tiler_postfix *tiler_postfix)
820 {
821 struct panfrost_context *ctx = batch->ctx;
822 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
823 const struct pipe_scissor_state *ss = &ctx->scissor;
824 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
825 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
826
827 /* Derive min/max from translate/scale. Note since |x| >= 0 by
828 * definition, we have that -|x| <= |x| hence translate - |scale| <=
829 * translate + |scale|, so the ordering is correct here. */
830 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
831 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
832 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
833 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
834 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
835 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
836
837 /* Scissor to the intersection of viewport and to the scissor, clamped
838 * to the framebuffer */
839
840 unsigned minx = MIN2(fb->width, vp_minx);
841 unsigned maxx = MIN2(fb->width, vp_maxx);
842 unsigned miny = MIN2(fb->height, vp_miny);
843 unsigned maxy = MIN2(fb->height, vp_maxy);
844
845 if (ss && rast->scissor) {
846 minx = MAX2(ss->minx, minx);
847 miny = MAX2(ss->miny, miny);
848 maxx = MIN2(ss->maxx, maxx);
849 maxy = MIN2(ss->maxy, maxy);
850 }
851
852 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
853
854 pan_pack(T.cpu, VIEWPORT, cfg) {
855 cfg.scissor_minimum_x = minx;
856 cfg.scissor_minimum_y = miny;
857 cfg.scissor_maximum_x = maxx - 1;
858 cfg.scissor_maximum_y = maxy - 1;
859
860 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
861 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
862 }
863
864 tiler_postfix->viewport = T.gpu;
865 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
866 }
867
868 static mali_ptr
869 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
870 enum pipe_shader_type st,
871 struct panfrost_constant_buffer *buf,
872 unsigned index)
873 {
874 struct pipe_constant_buffer *cb = &buf->cb[index];
875 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
876
877 if (rsrc) {
878 panfrost_batch_add_bo(batch, rsrc->bo,
879 PAN_BO_ACCESS_SHARED |
880 PAN_BO_ACCESS_READ |
881 panfrost_bo_access_for_stage(st));
882
883 /* Alignment guaranteed by
884 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
885 return rsrc->bo->gpu + cb->buffer_offset;
886 } else if (cb->user_buffer) {
887 return panfrost_pool_upload_aligned(&batch->pool,
888 cb->user_buffer +
889 cb->buffer_offset,
890 cb->buffer_size, 16);
891 } else {
892 unreachable("No constant buffer");
893 }
894 }
895
896 struct sysval_uniform {
897 union {
898 float f[4];
899 int32_t i[4];
900 uint32_t u[4];
901 uint64_t du[2];
902 };
903 };
904
905 static void
906 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
907 struct sysval_uniform *uniform)
908 {
909 struct panfrost_context *ctx = batch->ctx;
910 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
911
912 uniform->f[0] = vp->scale[0];
913 uniform->f[1] = vp->scale[1];
914 uniform->f[2] = vp->scale[2];
915 }
916
917 static void
918 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
919 struct sysval_uniform *uniform)
920 {
921 struct panfrost_context *ctx = batch->ctx;
922 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
923
924 uniform->f[0] = vp->translate[0];
925 uniform->f[1] = vp->translate[1];
926 uniform->f[2] = vp->translate[2];
927 }
928
929 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
930 enum pipe_shader_type st,
931 unsigned int sysvalid,
932 struct sysval_uniform *uniform)
933 {
934 struct panfrost_context *ctx = batch->ctx;
935 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
936 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
937 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
938 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
939
940 assert(dim);
941 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
942
943 if (dim > 1)
944 uniform->i[1] = u_minify(tex->texture->height0,
945 tex->u.tex.first_level);
946
947 if (dim > 2)
948 uniform->i[2] = u_minify(tex->texture->depth0,
949 tex->u.tex.first_level);
950
951 if (is_array)
952 uniform->i[dim] = tex->texture->array_size;
953 }
954
955 static void
956 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
957 enum pipe_shader_type st,
958 unsigned ssbo_id,
959 struct sysval_uniform *uniform)
960 {
961 struct panfrost_context *ctx = batch->ctx;
962
963 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
964 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
965
966 /* Compute address */
967 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
968
969 panfrost_batch_add_bo(batch, bo,
970 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
971 panfrost_bo_access_for_stage(st));
972
973 /* Upload address and size as sysval */
974 uniform->du[0] = bo->gpu + sb.buffer_offset;
975 uniform->u[2] = sb.buffer_size;
976 }
977
978 static void
979 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
980 enum pipe_shader_type st,
981 unsigned samp_idx,
982 struct sysval_uniform *uniform)
983 {
984 struct panfrost_context *ctx = batch->ctx;
985 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
986
987 uniform->f[0] = sampl->min_lod;
988 uniform->f[1] = sampl->max_lod;
989 uniform->f[2] = sampl->lod_bias;
990
991 /* Even without any errata, Midgard represents "no mipmapping" as
992 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
993 * panfrost_create_sampler_state which also explains our choice of
994 * epsilon value (again to keep behaviour consistent) */
995
996 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
997 uniform->f[1] = uniform->f[0] + (1.0/256.0);
998 }
999
1000 static void
1001 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1002 struct sysval_uniform *uniform)
1003 {
1004 struct panfrost_context *ctx = batch->ctx;
1005
1006 uniform->u[0] = ctx->compute_grid->grid[0];
1007 uniform->u[1] = ctx->compute_grid->grid[1];
1008 uniform->u[2] = ctx->compute_grid->grid[2];
1009 }
1010
1011 static void
1012 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1013 struct panfrost_shader_state *ss,
1014 enum pipe_shader_type st)
1015 {
1016 struct sysval_uniform *uniforms = (void *)buf;
1017
1018 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1019 int sysval = ss->sysval[i];
1020
1021 switch (PAN_SYSVAL_TYPE(sysval)) {
1022 case PAN_SYSVAL_VIEWPORT_SCALE:
1023 panfrost_upload_viewport_scale_sysval(batch,
1024 &uniforms[i]);
1025 break;
1026 case PAN_SYSVAL_VIEWPORT_OFFSET:
1027 panfrost_upload_viewport_offset_sysval(batch,
1028 &uniforms[i]);
1029 break;
1030 case PAN_SYSVAL_TEXTURE_SIZE:
1031 panfrost_upload_txs_sysval(batch, st,
1032 PAN_SYSVAL_ID(sysval),
1033 &uniforms[i]);
1034 break;
1035 case PAN_SYSVAL_SSBO:
1036 panfrost_upload_ssbo_sysval(batch, st,
1037 PAN_SYSVAL_ID(sysval),
1038 &uniforms[i]);
1039 break;
1040 case PAN_SYSVAL_NUM_WORK_GROUPS:
1041 panfrost_upload_num_work_groups_sysval(batch,
1042 &uniforms[i]);
1043 break;
1044 case PAN_SYSVAL_SAMPLER:
1045 panfrost_upload_sampler_sysval(batch, st,
1046 PAN_SYSVAL_ID(sysval),
1047 &uniforms[i]);
1048 break;
1049 default:
1050 assert(0);
1051 }
1052 }
1053 }
1054
1055 static const void *
1056 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1057 unsigned index)
1058 {
1059 struct pipe_constant_buffer *cb = &buf->cb[index];
1060 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1061
1062 if (rsrc)
1063 return rsrc->bo->cpu;
1064 else if (cb->user_buffer)
1065 return cb->user_buffer;
1066 else
1067 unreachable("No constant buffer");
1068 }
1069
1070 void
1071 panfrost_emit_const_buf(struct panfrost_batch *batch,
1072 enum pipe_shader_type stage,
1073 struct mali_vertex_tiler_postfix *postfix)
1074 {
1075 struct panfrost_context *ctx = batch->ctx;
1076 struct panfrost_shader_variants *all = ctx->shader[stage];
1077
1078 if (!all)
1079 return;
1080
1081 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1082
1083 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1084
1085 /* Uniforms are implicitly UBO #0 */
1086 bool has_uniforms = buf->enabled_mask & (1 << 0);
1087
1088 /* Allocate room for the sysval and the uniforms */
1089 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1090 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1091 size_t size = sys_size + uniform_size;
1092 struct panfrost_transfer transfer =
1093 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1094
1095 /* Upload sysvals requested by the shader */
1096 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1097
1098 /* Upload uniforms */
1099 if (has_uniforms && uniform_size) {
1100 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1101 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1102 }
1103
1104 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1105 * uploaded, so it's always included. The count is the highest UBO
1106 * addressable -- gaps are included. */
1107
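/* One plus the index of the highest enabled UBO; OR with 1 so that UBO #0
 * (the uniforms we just uploaded) is always counted */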
1108 unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
1109
1110 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1111 struct panfrost_transfer ubos =
1112 panfrost_pool_alloc_aligned(&batch->pool, sz,
1113 MALI_UNIFORM_BUFFER_LENGTH);
1114
1115 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1116
1117 /* Upload uniforms as a UBO */
1118
1119 if (size) {
1120 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1121 cfg.entries = DIV_ROUND_UP(size, 16);
1122 cfg.pointer = transfer.gpu;
1123 }
1124 } else {
1125 *ubo_ptr = 0;
1126 }
1127
1128 /* The rest are honest-to-goodness UBOs */
1129
1130 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1131 size_t usz = buf->cb[ubo].buffer_size;
1132 bool enabled = buf->enabled_mask & (1 << ubo);
1133 bool empty = usz == 0;
1134
1135 if (!enabled || empty) {
1136 ubo_ptr[ubo] = 0;
1137 continue;
1138 }
1139
1140 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1141 cfg.entries = DIV_ROUND_UP(usz, 16);
1142 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1143 stage, buf, ubo);
1144 }
1145 }
1146
1147 postfix->uniforms = transfer.gpu;
1148 postfix->uniform_buffers = ubos.gpu;
1149
1150 buf->dirty_mask = 0;
1151 }
1152
1153 void
1154 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1155 const struct pipe_grid_info *info,
1156 struct midgard_payload_vertex_tiler *vtp)
1157 {
1158 struct panfrost_context *ctx = batch->ctx;
1159 struct panfrost_device *dev = pan_device(ctx->base.screen);
1160 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1161 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1162 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1163 128));
1164
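/* Conservatively bound the number of concurrent workgroups by rounding each
 * grid dimension up to a power of two, then allocate that many shared-memory
 * slices per core */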
1165 unsigned log2_instances =
1166 util_logbase2_ceil(info->grid[0]) +
1167 util_logbase2_ceil(info->grid[1]) +
1168 util_logbase2_ceil(info->grid[2]);
1169
1170 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1171 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1172 shared_size,
1173 1);
1174
1175 struct mali_shared_memory shared = {
1176 .shared_memory = bo->gpu,
1177 .shared_workgroup_count = log2_instances,
1178 .shared_shift = util_logbase2(single_size) + 1
1179 };
1180
1181 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1182 sizeof(shared), 64);
1183 }
1184
1185 static mali_ptr
1186 panfrost_get_tex_desc(struct panfrost_batch *batch,
1187 enum pipe_shader_type st,
1188 struct panfrost_sampler_view *view)
1189 {
1190 if (!view)
1191 return (mali_ptr) 0;
1192
1193 struct pipe_sampler_view *pview = &view->base;
1194 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1195
1196 /* Add the BO to the job so it's retained until the job is done. */
1197
1198 panfrost_batch_add_bo(batch, rsrc->bo,
1199 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1200 panfrost_bo_access_for_stage(st));
1201
1202 panfrost_batch_add_bo(batch, view->bo,
1203 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1204 panfrost_bo_access_for_stage(st));
1205
1206 return view->bo->gpu;
1207 }
1208
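/* The cached texture descriptor references a specific BO and layout modifier;
 * if either has changed since the view was created, rebuild the descriptor */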
1209 static void
1210 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1211 struct pipe_context *pctx)
1212 {
1213 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1214 if (view->texture_bo != rsrc->bo->gpu ||
1215 view->modifier != rsrc->modifier) {
1216 panfrost_bo_unreference(view->bo);
1217 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1218 }
1219 }
1220
1221 void
1222 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1223 enum pipe_shader_type stage,
1224 struct mali_vertex_tiler_postfix *postfix)
1225 {
1226 struct panfrost_context *ctx = batch->ctx;
1227 struct panfrost_device *device = pan_device(ctx->base.screen);
1228
1229 if (!ctx->sampler_view_count[stage])
1230 return;
1231
1232 if (device->quirks & IS_BIFROST) {
1233 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1234 MALI_BIFROST_TEXTURE_LENGTH *
1235 ctx->sampler_view_count[stage],
1236 MALI_BIFROST_TEXTURE_LENGTH);
1237
1238 struct mali_bifrost_texture_packed *out =
1239 (struct mali_bifrost_texture_packed *) T.cpu;
1240
1241 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1242 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1243 struct pipe_sampler_view *pview = &view->base;
1244 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1245
1246 panfrost_update_sampler_view(view, &ctx->base);
1247 out[i] = view->bifrost_descriptor;
1248
1249 /* Add the BOs to the job so they are retained until the job is done. */
1250
1251 panfrost_batch_add_bo(batch, rsrc->bo,
1252 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1253 panfrost_bo_access_for_stage(stage));
1254
1255 panfrost_batch_add_bo(batch, view->bo,
1256 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1257 panfrost_bo_access_for_stage(stage));
1258 }
1259
1260 postfix->textures = T.gpu;
1261 } else {
1262 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1263
1264 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1265 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1266
1267 panfrost_update_sampler_view(view, &ctx->base);
1268
1269 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1270 }
1271
1272 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1273 trampolines,
1274 sizeof(uint64_t) *
1275 ctx->sampler_view_count[stage],
1276 sizeof(uint64_t));
1277 }
1278 }
1279
1280 void
1281 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1282 enum pipe_shader_type stage,
1283 struct mali_vertex_tiler_postfix *postfix)
1284 {
1285 struct panfrost_context *ctx = batch->ctx;
1286
1287 if (!ctx->sampler_count[stage])
1288 return;
1289
1290 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1291 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1292
1293 size_t sz = desc_size * ctx->sampler_count[stage];
1294 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1295 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1296
1297 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1298 out[i] = ctx->samplers[stage][i]->hw;
1299
1300 postfix->sampler_descriptor = T.gpu;
1301 }
1302
1303 void
1304 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1305 struct mali_vertex_tiler_postfix *vertex_postfix)
1306 {
1307 struct panfrost_context *ctx = batch->ctx;
1308 struct panfrost_vertex_state *so = ctx->vertex;
1309 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1310
1311 unsigned instance_shift = vertex_postfix->instance_shift;
1312 unsigned instance_odd = vertex_postfix->instance_odd;
1313
1314 /* Worst case: everything is NPOT, which is only possible if instancing
1315 * is enabled. Otherwise a single record is guaranteed */
1316 bool could_npot = instance_shift || instance_odd;
1317
1318 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1319 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1320 (could_npot ? 2 : 1),
1321 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1322
1323 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1324 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1325 MALI_ATTRIBUTE_LENGTH);
1326
1327 struct mali_attribute_buffer_packed *bufs =
1328 (struct mali_attribute_buffer_packed *) S.cpu;
1329
1330 struct mali_attribute_packed *out =
1331 (struct mali_attribute_packed *) T.cpu;
1332
1333 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1334 unsigned k = 0;
1335
1336 for (unsigned i = 0; i < so->num_elements; ++i) {
1337 /* We map buffers 1:1 with the attributes, which
1338 * means duplicating some vertex buffers (who cares? aside from
1339 * maybe some caching implications but I somehow doubt that
1340 * matters) */
1341
1342 struct pipe_vertex_element *elem = &so->pipe[i];
1343 unsigned vbi = elem->vertex_buffer_index;
1344 attrib_to_buffer[i] = k;
1345
1346 if (!(ctx->vb_mask & (1 << vbi)))
1347 continue;
1348
1349 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1350 struct panfrost_resource *rsrc;
1351
1352 rsrc = pan_resource(buf->buffer.resource);
1353 if (!rsrc)
1354 continue;
1355
1356 /* Add a dependency of the batch on the vertex buffer */
1357 panfrost_batch_add_bo(batch, rsrc->bo,
1358 PAN_BO_ACCESS_SHARED |
1359 PAN_BO_ACCESS_READ |
1360 PAN_BO_ACCESS_VERTEX_TILER);
1361
1362 /* Mask off lower bits, see offset fixup below */
1363 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1364 mali_ptr addr = raw_addr & ~63;
1365
1366 /* Since we advanced the base pointer, we shrink the buffer
1367 * size, but add the offset we subtracted */
1368 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1369 - buf->buffer_offset;
1370
1371 /* When there is a divisor, the hardware-level divisor is
1372 * the product of the instance divisor and the padded count */
1373 unsigned divisor = elem->instance_divisor;
1374 unsigned hw_divisor = ctx->padded_count * divisor;
1375 unsigned stride = buf->stride;
1376
1377 /* If there's a divisor (even 1) but no instancing, we want every
1378 * attribute to be the same */
1379
1380 if (divisor && ctx->instance_count == 1)
1381 stride = 0;
1382
1383 if (!divisor || ctx->instance_count <= 1) {
1384 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1385 if (ctx->instance_count > 1)
1386 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1387
1388 cfg.pointer = addr;
1389 cfg.stride = stride;
1390 cfg.size = size;
1391 cfg.divisor_r = instance_shift;
1392 cfg.divisor_p = instance_odd;
1393 }
1394 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1395 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1396 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1397 cfg.pointer = addr;
1398 cfg.stride = stride;
1399 cfg.size = size;
1400 cfg.divisor_r = __builtin_ctz(hw_divisor);
1401 }
1402
1403 } else {
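/* Non-power-of-two divisors are implemented with a magic multiply-and-shift
 * reciprocal whose extra parameters spill into a continuation record, so
 * this path consumes two buffer slots (note the extra ++k below) */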
1404 unsigned shift = 0, extra_flags = 0;
1405
1406 unsigned magic_divisor =
1407 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1408
1409 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1410 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1411 cfg.pointer = addr;
1412 cfg.stride = stride;
1413 cfg.size = size;
1414
1415 cfg.divisor_r = shift;
1416 cfg.divisor_e = extra_flags;
1417 }
1418
1419 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1420 cfg.divisor_numerator = magic_divisor;
1421 cfg.divisor = divisor;
1422 }
1423
1424 ++k;
1425 }
1426
1427 ++k;
1428 }
1429
1430 /* Add special gl_VertexID/gl_InstanceID buffers */
1431
1432 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1433 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1434
1435 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1436 cfg.buffer_index = k++;
1437 cfg.format = so->formats[PAN_VERTEX_ID];
1438 }
1439
1440 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1441
1442 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1443 cfg.buffer_index = k++;
1444 cfg.format = so->formats[PAN_INSTANCE_ID];
1445 }
1446 }
1447
1448 /* Attribute addresses require 64-byte alignment, so let:
1449 *
1450 * base' = base & ~63 = base - (base & 63)
1451 * offset' = offset + (base & 63)
1452 *
1453 * Since base' + offset' = base + offset, these are equivalent
1454 * addressing modes and now base is 64 aligned.
1455 */
1456
1457 unsigned start = vertex_postfix->offset_start;
1458
1459 for (unsigned i = 0; i < so->num_elements; ++i) {
1460 unsigned vbi = so->pipe[i].vertex_buffer_index;
1461 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1462
1463 /* Adjust by the masked off bits of the offset. Make sure we
1464 * read src_offset from so->hw (which is not GPU visible)
1465 * rather than target (which is) due to caching effects */
1466
1467 unsigned src_offset = so->pipe[i].src_offset;
1468
1469 /* BOs aligned to 4k so guaranteed aligned to 64 */
1470 src_offset += (buf->buffer_offset & 63);
1471
1472 /* Also, somewhat obscurely, per-instance data needs to be
1473 * offset in response to a delayed start in an indexed draw */
1474
1475 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1476 src_offset -= buf->stride * start;
1477
1478 pan_pack(out + i, ATTRIBUTE, cfg) {
1479 cfg.buffer_index = attrib_to_buffer[i];
1480 cfg.format = so->formats[i];
1481 cfg.offset = src_offset;
1482 }
1483 }
1484
1485 vertex_postfix->attributes = S.gpu;
1486 vertex_postfix->attribute_meta = T.gpu;
1487 }
1488
1489 static mali_ptr
1490 panfrost_emit_varyings(struct panfrost_batch *batch,
1491 struct mali_attribute_buffer_packed *slot,
1492 unsigned stride, unsigned count)
1493 {
1494 unsigned size = stride * count;
1495 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1496
1497 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1498 cfg.stride = stride;
1499 cfg.size = size;
1500 cfg.pointer = ptr;
1501 }
1502
1503 return ptr;
1504 }
1505
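/* The attribute buffer pointer is aligned down to 64 bytes in
 * panfrost_emit_streamout, so return the dropped low bits here to be
 * re-applied as the varying record's offset */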
1506 static unsigned
1507 panfrost_streamout_offset(unsigned stride, unsigned offset,
1508 struct pipe_stream_output_target *target)
1509 {
1510 return (target->buffer_offset + (offset * stride * 4)) & 63;
1511 }
1512
1513 static void
1514 panfrost_emit_streamout(struct panfrost_batch *batch,
1515 struct mali_attribute_buffer_packed *slot,
1516 unsigned stride_words, unsigned offset, unsigned count,
1517 struct pipe_stream_output_target *target)
1518 {
1519 unsigned stride = stride_words * 4;
1520 unsigned max_size = target->buffer_size;
1521 unsigned expected_size = stride * count;
1522
1523 /* Grab the BO and bind it to the batch */
1524 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1525
1526 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1527 * the perspective of the TILER and FRAGMENT.
1528 */
1529 panfrost_batch_add_bo(batch, bo,
1530 PAN_BO_ACCESS_SHARED |
1531 PAN_BO_ACCESS_RW |
1532 PAN_BO_ACCESS_VERTEX_TILER |
1533 PAN_BO_ACCESS_FRAGMENT);
1534
1535 /* We will have an offset applied to get alignment */
1536 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1537
1538 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1539 cfg.pointer = (addr & ~63);
1540 cfg.stride = stride;
1541 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1542 }
1543 }
1544
1545 static bool
1546 has_point_coord(unsigned mask, gl_varying_slot loc)
1547 {
1548 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1549 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1550 else if (loc == VARYING_SLOT_PNTC)
1551 return (mask & (1 << 8));
1552 else
1553 return false;
1554 }
1555
1556 /* Helpers for manipulating stream out information so we can pack varyings
1557 * accordingly. Compute the src_offset for a given captured varying */
1558
1559 static struct pipe_stream_output *
1560 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1561 {
1562 for (unsigned i = 0; i < info->num_outputs; ++i) {
1563 if (info->output[i].register_index == loc)
1564 return &info->output[i];
1565 }
1566
1567 unreachable("Varying not captured");
1568 }
1569
1570 static unsigned
1571 pan_varying_size(enum mali_format fmt)
1572 {
1573 unsigned type = MALI_EXTRACT_TYPE(fmt);
1574 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1575 unsigned bits = MALI_EXTRACT_BITS(fmt);
1576 unsigned bpc = 0;
1577
1578 if (bits == MALI_CHANNEL_FLOAT) {
1579 /* No doubles */
1580 bool fp16 = (type == MALI_FORMAT_SINT);
1581 assert(fp16 || (type == MALI_FORMAT_UNORM));
1582
1583 bpc = fp16 ? 2 : 4;
1584 } else {
1585 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1586
1587 /* See the enums */
1588 bits = 1 << bits;
1589 assert(bits >= 8);
1590 bpc = bits / 8;
1591 }
1592
1593 return bpc * chan;
1594 }
1595
1596 /* Indices for named (non-XFB) varyings that are present. These are packed
1597 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1598 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1599 * of a given special field given a shift S by:
1600 *
1601 * idx = popcount(P & ((1 << S) - 1))
1602 *
1603 * That is, look at all of the varyings that come earlier and count them; that
1604 * count is the index of the new one. Likewise, the total number of special
1605 * buffers required is simply popcount(P)
1606 */
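/* For example, with P = general | position | psiz = 0b111, the point size
 * buffer (PAN_VARY_PSIZ = 2) lands at index popcount(0b011) = 2 */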
1607
1608 enum pan_special_varying {
1609 PAN_VARY_GENERAL = 0,
1610 PAN_VARY_POSITION = 1,
1611 PAN_VARY_PSIZ = 2,
1612 PAN_VARY_PNTCOORD = 3,
1613 PAN_VARY_FACE = 4,
1614 PAN_VARY_FRAGCOORD = 5,
1615
1616 /* Keep last */
1617 PAN_VARY_MAX,
1618 };
1619
1620 /* Given a varying, figure out which index it corresponds to */
1621
1622 static inline unsigned
1623 pan_varying_index(unsigned present, enum pan_special_varying v)
1624 {
1625 unsigned mask = (1 << v) - 1;
1626 return util_bitcount(present & mask);
1627 }
1628
1629 /* Get the base offset for XFB buffers, which by convention come after
1630 * everything else. Wrapper function for semantic reasons; by construction this
1631 * is just popcount. */
1632
1633 static inline unsigned
1634 pan_xfb_base(unsigned present)
1635 {
1636 return util_bitcount(present);
1637 }
1638
1639 /* Computes the present mask for varyings so we can start emitting varying records */
1640
1641 static inline unsigned
1642 pan_varying_present(
1643 struct panfrost_shader_state *vs,
1644 struct panfrost_shader_state *fs,
1645 unsigned quirks)
1646 {
1647 /* At the moment we always emit general and position buffers. Not
1648 * strictly necessary but usually harmless */
1649
1650 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1651
1652 /* Enable special buffers by the shader info */
1653
1654 if (vs->writes_point_size)
1655 present |= (1 << PAN_VARY_PSIZ);
1656
1657 if (fs->reads_point_coord)
1658 present |= (1 << PAN_VARY_PNTCOORD);
1659
1660 if (fs->reads_face)
1661 present |= (1 << PAN_VARY_FACE);
1662
1663 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1664 present |= (1 << PAN_VARY_FRAGCOORD);
1665
1666 /* Also, if we have a point sprite, we need a point coord buffer */
1667
1668 for (unsigned i = 0; i < fs->varying_count; i++) {
1669 gl_varying_slot loc = fs->varyings_loc[i];
1670
1671 if (has_point_coord(fs->point_sprite_mask, loc))
1672 present |= (1 << PAN_VARY_PNTCOORD);
1673 }
1674
1675 return present;
1676 }
1677
1678 /* Emitters for varying records */
1679
1680 static void
1681 pan_emit_vary(struct mali_attribute_packed *out,
1682 unsigned present, enum pan_special_varying buf,
1683 unsigned quirks, enum mali_format format,
1684 unsigned offset)
1685 {
1686 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1687 unsigned swizzle = quirks & HAS_SWIZZLES ?
1688 panfrost_get_default_swizzle(nr_channels) :
1689 panfrost_bifrost_swizzle(nr_channels);
1690
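/* The packed format word carries the swizzle in its low 12 bits, hence the
 * shift before OR'ing the swizzle in */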
1691 pan_pack(out, ATTRIBUTE, cfg) {
1692 cfg.buffer_index = pan_varying_index(present, buf);
1693 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1694 cfg.format = (format << 12) | swizzle;
1695 cfg.offset = offset;
1696 }
1697 }
1698
1699 /* General varying that is unused */
1700
1701 static void
1702 pan_emit_vary_only(struct mali_attribute_packed *out,
1703 unsigned present, unsigned quirks)
1704 {
1705 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1706 }
1707
1708 /* Special records */
1709
1710 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1711 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1712 [PAN_VARY_PSIZ] = MALI_R16F,
1713 [PAN_VARY_PNTCOORD] = MALI_R16F,
1714 [PAN_VARY_FACE] = MALI_R32I,
1715 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1716 };
1717
1718 static void
1719 pan_emit_vary_special(struct mali_attribute_packed *out,
1720 unsigned present, enum pan_special_varying buf,
1721 unsigned quirks)
1722 {
1723 assert(buf < PAN_VARY_MAX);
1724 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1725 }
1726
1727 static enum mali_format
1728 pan_xfb_format(enum mali_format format, unsigned nr)
1729 {
1730 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1731 return MALI_R32F | MALI_NR_CHANNELS(nr);
1732 else
1733 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1734 }
1735
1736 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1737 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1738 * value. */
1739
1740 static void
1741 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1742 unsigned present,
1743 unsigned max_xfb,
1744 unsigned *streamout_offsets,
1745 unsigned quirks,
1746 enum mali_format format,
1747 struct pipe_stream_output o)
1748 {
1749 unsigned swizzle = quirks & HAS_SWIZZLES ?
1750 panfrost_get_default_swizzle(o.num_components) :
1751 panfrost_bifrost_swizzle(o.num_components);
1752
1753 pan_pack(out, ATTRIBUTE, cfg) {
1754 /* XFB buffers come after everything else */
1755 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1756 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1757
1758 /* Override number of channels and precision to highp */
1759 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1760
1761 /* Combine the per-varying destination offset (in dwords) with the per-buffer streamout offset */
1762 cfg.offset = (o.dst_offset * 4) /* dwords */
1763 + streamout_offsets[o.output_buffer];
1764 }
1765 }
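
/* Worked example with made-up numbers: capturing to output_buffer 1 with
 * o.dst_offset = 4 (dwords) and streamout_offsets[1] = 256 yields
 * cfg.offset = 4 * 4 + 256 = 272 bytes into that buffer. */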
1766
1767 /* Determine if we should capture a varying for XFB. This requires actually
1768 * having a buffer for it. If we don't capture it, we'll fall back to a general
1769 * varying path (linked or unlinked, possibly discarding the write) */
1770
1771 static bool
1772 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1773 unsigned loc, unsigned max_xfb)
1774 {
1775 if (!(xfb->so_mask & (1ll << loc)))
1776 return false;
1777
1778 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1779 return o->output_buffer < max_xfb;
1780 }
1781
1782 static void
1783 pan_emit_general_varying(struct mali_attribute_packed *out,
1784 struct panfrost_shader_state *other,
1785 struct panfrost_shader_state *xfb,
1786 gl_varying_slot loc,
1787 enum mali_format format,
1788 unsigned present,
1789 unsigned quirks,
1790 unsigned *gen_offsets,
1791 enum mali_format *gen_formats,
1792 unsigned *gen_stride,
1793 unsigned idx,
1794 bool should_alloc)
1795 {
1796 /* Check if we're linked */
1797 signed other_idx = -1;
1798
1799 for (unsigned j = 0; j < other->varying_count; ++j) {
1800 if (other->varyings_loc[j] == loc) {
1801 other_idx = j;
1802 break;
1803 }
1804 }
1805
1806 if (other_idx < 0) {
1807 pan_emit_vary_only(out, present, quirks);
1808 return;
1809 }
1810
1811 unsigned offset = gen_offsets[other_idx];
1812
1813 if (should_alloc) {
1814 /* We're linked, so allocate space via a watermark allocation */
1815 enum mali_format alt = other->varyings[other_idx];
1816
1817 /* Do interpolation at minimum precision */
1818 unsigned size_main = pan_varying_size(format);
1819 unsigned size_alt = pan_varying_size(alt);
1820 unsigned size = MIN2(size_main, size_alt);
1821
1822 /* If a varying is marked for XFB but not actually captured, we
1823 * should match the format to the format that would otherwise
1824 * be used for XFB, since dEQP checks for invariance here. It's
1825 * unclear if this is required by the spec. */
1826
1827 if (xfb->so_mask & (1ull << loc)) {
1828 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1829 format = pan_xfb_format(format, o->num_components);
1830 size = pan_varying_size(format);
1831 } else if (size == size_alt) {
1832 format = alt;
1833 }
1834
1835 gen_offsets[idx] = *gen_stride;
1836 gen_formats[other_idx] = format;
1837 offset = *gen_stride;
1838 *gen_stride += size;
1839 }
1840
1841 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1842 }
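
/* Sketch of the watermark allocation above, assuming pan_varying_size reports
 * bytes: linking an fp32 vec4 producer against an fp16 vec4 consumer reserves
 * only the smaller 8-byte slot, and *gen_stride advances by that amount so the
 * next linked varying packs immediately after it. */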
1843
1844 /* Higher-level wrapper around all of the above, classifying a varying as a
1845 * point coordinate, an XFB capture, a special slot, or a general linked varying */
1846
1847 static void
1848 panfrost_emit_varying(
1849 struct mali_attribute_packed *out,
1850 struct panfrost_shader_state *stage,
1851 struct panfrost_shader_state *other,
1852 struct panfrost_shader_state *xfb,
1853 unsigned present,
1854 unsigned max_xfb,
1855 unsigned *streamout_offsets,
1856 unsigned quirks,
1857 unsigned *gen_offsets,
1858 enum mali_format *gen_formats,
1859 unsigned *gen_stride,
1860 unsigned idx,
1861 bool should_alloc,
1862 bool is_fragment)
1863 {
1864 gl_varying_slot loc = stage->varyings_loc[idx];
1865 enum mali_format format = stage->varyings[idx];
1866
1867 /* Override format to match linkage */
1868 if (!should_alloc && gen_formats[idx])
1869 format = gen_formats[idx];
1870
1871 if (has_point_coord(stage->point_sprite_mask, loc)) {
1872 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1873 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1874 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1875 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1876 } else if (loc == VARYING_SLOT_POS) {
1877 if (is_fragment)
1878 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1879 else
1880 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1881 } else if (loc == VARYING_SLOT_PSIZ) {
1882 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1883 } else if (loc == VARYING_SLOT_PNTC) {
1884 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1885 } else if (loc == VARYING_SLOT_FACE) {
1886 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1887 } else {
1888 pan_emit_general_varying(out, other, xfb, loc, format, present,
1889 quirks, gen_offsets, gen_formats, gen_stride,
1890 idx, should_alloc);
1891 }
1892 }
1893
1894 static void
1895 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1896 unsigned present,
1897 enum pan_special_varying v,
1898 unsigned special)
1899 {
1900 if (present & (1 << v)) {
1901 unsigned idx = pan_varying_index(present, v);
1902
1903 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1904 cfg.special = special;
1905 cfg.type = 0;
1906 }
1907 }
1908 }
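
/* Special inputs (point coord, front facing, frag coord) are presumably
 * produced by the hardware itself rather than fetched from memory, so the
 * record only names the special source and carries no pointer or stride. */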
1909
1910 void
1911 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1912 unsigned vertex_count,
1913 struct mali_vertex_tiler_postfix *vertex_postfix,
1914 struct mali_vertex_tiler_postfix *tiler_postfix,
1915 union midgard_primitive_size *primitive_size)
1916 {
1917 /* Load the shaders */
1918 struct panfrost_context *ctx = batch->ctx;
1919 struct panfrost_device *dev = pan_device(ctx->base.screen);
1920 struct panfrost_shader_state *vs, *fs;
1921 size_t vs_size, fs_size;
1922
1923 /* Allocate the varying descriptor */
1924
1925 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1926 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1927 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1928 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1929
1930 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1931 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1932
1933 struct pipe_stream_output_info *so = &vs->stream_output;
1934 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1935
1936 /* Check if this varying is linked by us. This is the case for
1937 * general-purpose, non-captured varyings. If it is, link it. If it's
1938 * not, use the provided stream out information to determine the
1939 * offset, since it was already linked for us. */
1940
1941 unsigned gen_offsets[32];
1942 enum mali_format gen_formats[32];
1943 memset(gen_offsets, 0, sizeof(gen_offsets));
1944 memset(gen_formats, 0, sizeof(gen_formats));
1945
1946 unsigned gen_stride = 0;
1947 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1948 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1949
1950 unsigned streamout_offsets[32];
1951
1952 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1953 streamout_offsets[i] = panfrost_streamout_offset(
1954 so->stride[i],
1955 ctx->streamout.offsets[i],
1956 ctx->streamout.targets[i]);
1957 }
1958
1959 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1960 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1961
1962 for (unsigned i = 0; i < vs->varying_count; i++) {
1963 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1964 ctx->streamout.num_targets, streamout_offsets,
1965 dev->quirks,
1966 gen_offsets, gen_formats, &gen_stride, i, true, false);
1967 }
1968
1969 for (unsigned i = 0; i < fs->varying_count; i++) {
1970 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1971 ctx->streamout.num_targets, streamout_offsets,
1972 dev->quirks,
1973 gen_offsets, gen_formats, &gen_stride, i, false, true);
1974 }
1975
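/* Allocate the attribute buffer records: one per buffer named in the present
 * mask, followed by one per active stream-out target (hence the
 * xfb_base + ctx->streamout.num_targets count below). */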
1976 unsigned xfb_base = pan_xfb_base(present);
1977 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1978 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1979 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1980 struct mali_attribute_buffer_packed *varyings =
1981 (struct mali_attribute_buffer_packed *) T.cpu;
1982
1983 /* Emit the stream out buffers */
1984
1985 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1986 ctx->vertex_count);
1987
1988 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1989 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1990 so->stride[i],
1991 ctx->streamout.offsets[i],
1992 out_count,
1993 ctx->streamout.targets[i]);
1994 }
1995
1996 panfrost_emit_varyings(batch,
1997 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1998 gen_stride, vertex_count);
1999
2000 /* fp32 vec4 gl_Position */
2001 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2002 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2003 sizeof(float) * 4, vertex_count);
2004
2005 if (present & (1 << PAN_VARY_PSIZ)) {
2006 primitive_size->pointer = panfrost_emit_varyings(batch,
2007 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2008 2, vertex_count);
2009 }
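
/* gl_PointSize is stored as a 16-bit float (MALI_R16F in the special format
 * table above), hence the 2-byte per-vertex stride for the PSIZ buffer. */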
2010
2011 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2012 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2013 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2014
2015 vertex_postfix->varyings = T.gpu;
2016 tiler_postfix->varyings = T.gpu;
2017
2018 vertex_postfix->varying_meta = trans.gpu;
2019 tiler_postfix->varying_meta = trans.gpu + vs_size;
2020 }
2021
2022 void
2023 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2024 struct mali_vertex_tiler_prefix *vertex_prefix,
2025 struct mali_vertex_tiler_postfix *vertex_postfix,
2026 struct mali_vertex_tiler_prefix *tiler_prefix,
2027 struct mali_vertex_tiler_postfix *tiler_postfix,
2028 union midgard_primitive_size *primitive_size)
2029 {
2030 struct panfrost_context *ctx = batch->ctx;
2031 struct panfrost_device *device = pan_device(ctx->base.screen);
2032 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2033 struct bifrost_payload_vertex bifrost_vertex = {0,};
2034 struct bifrost_payload_tiler bifrost_tiler = {0,};
2035 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2036 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2037 void *vp, *tp;
2038 size_t vp_size, tp_size;
2039
2040 if (device->quirks & IS_BIFROST) {
2041 bifrost_vertex.prefix = *vertex_prefix;
2042 bifrost_vertex.postfix = *vertex_postfix;
2043 vp = &bifrost_vertex;
2044 vp_size = sizeof(bifrost_vertex);
2045
2046 bifrost_tiler.prefix = *tiler_prefix;
2047 bifrost_tiler.tiler.primitive_size = *primitive_size;
2048 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2049 bifrost_tiler.postfix = *tiler_postfix;
2050 tp = &bifrost_tiler;
2051 tp_size = sizeof(bifrost_tiler);
2052 } else {
2053 midgard_vertex.prefix = *vertex_prefix;
2054 midgard_vertex.postfix = *vertex_postfix;
2055 vp = &midgard_vertex;
2056 vp_size = sizeof(midgard_vertex);
2057
2058 midgard_tiler.prefix = *tiler_prefix;
2059 midgard_tiler.postfix = *tiler_postfix;
2060 midgard_tiler.primitive_size = *primitive_size;
2061 tp = &midgard_tiler;
2062 tp_size = sizeof(midgard_tiler);
2063 }
2064
2065 if (wallpapering) {
2066 /* Inject in reverse order, with "predicted" job indices.
2067 * THIS IS A HACK XXX */
2068 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2069 batch->scoreboard.job_index + 2, tp, tp_size, true);
2070 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2071 vp, vp_size, true);
2072 return;
2073 }
2074
2075 /* If rasterizer discard is enabled, only submit the vertex job */
2076
2077 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2078 vp, vp_size, false);
2079
2080 if (ctx->rasterizer->base.rasterizer_discard)
2081 return;
2082
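/* Otherwise, chain the tiler job after the vertex job by passing the vertex
 * job's scoreboard index as its dependency */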
2083 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2084 false);
2085 }
2086
2087 /* TODO: stop hardcoding this */
2088 mali_ptr
2089 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2090 {
2091 uint16_t locations[] = {
2092 128, 128,
2093 0, 256,
2094 0, 256,
2095 0, 256,
2096 0, 256,
2097 0, 256,
2098 0, 256,
2099 0, 256,
2100 0, 256,
2101 0, 256,
2102 0, 256,
2103 0, 256,
2104 0, 256,
2105 0, 256,
2106 0, 256,
2107 0, 256,
2108 0, 256,
2109 0, 256,
2110 0, 256,
2111 0, 256,
2112 0, 256,
2113 0, 256,
2114 0, 256,
2115 0, 256,
2116 0, 256,
2117 0, 256,
2118 0, 256,
2119 0, 256,
2120 0, 256,
2121 0, 256,
2122 0, 256,
2123 0, 256,
2124 128, 128,
2125 0, 0,
2126 0, 0,
2127 0, 0,
2128 0, 0,
2129 0, 0,
2130 0, 0,
2131 0, 0,
2132 0, 0,
2133 0, 0,
2134 0, 0,
2135 0, 0,
2136 0, 0,
2137 0, 0,
2138 0, 0,
2139 0, 0,
2140 };
2141
2142 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2143 }