panfrost: Use packs for fragment properties
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), though it may last longer. Also get
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
291
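/* Illustrative: the shift/odd pair below encodes the padded count as an odd
 * factor times a power of two, e.g. 24 = (2*1 + 1) << 3 gives shift = ctz(24)
 * = 3 and odd factor k = 24 >> 4 = 1 */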
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static void
310 panfrost_emit_compute_shader(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 meta->shader = ss->shader;
319 meta->attribute_count = ss->attribute_count;
320 meta->varying_count = ss->varying_count;
321 meta->texture_count = ctx->sampler_view_count[st];
322 meta->sampler_count = ctx->sampler_count[st];
323
324 if (dev->quirks & IS_BIFROST) {
325 meta->bifrost1.unk1 = 0x800000;
326 meta->bifrost2.preload_regs = 0xC0;
327 meta->bifrost2.uniform_count = ss->uniform_count;
328 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
329 } else {
330 struct mali_midgard_properties_packed prop;
331
332 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
333 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, st);
334 cfg.uniform_count = ss->uniform_count;
335 cfg.work_register_count = ss->work_reg_count;
336 cfg.writes_globals = ss->writes_global;
337 cfg.suppress_inf_nan = true; /* XXX */
338 }
339
340 memcpy(&meta->midgard1, &prop, sizeof(prop));
341 }
342 }
343
344 static unsigned
345 translate_tex_wrap(enum pipe_tex_wrap w)
346 {
347 switch (w) {
348 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
349 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
350 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
351 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
352 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
353 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
354 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
355 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
356 default: unreachable("Invalid wrap");
357 }
358 }
359
360 /* The hardware compares in the wrong order, so we have to flip before
361 * encoding. Yes, really. */
362
363 static enum mali_func
364 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
365 {
366 if (!cso->compare_mode)
367 return MALI_FUNC_NEVER;
368
369 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
370 return panfrost_flip_compare_func(f);
371 }
372
373 static enum mali_mipmap_mode
374 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
375 {
376 switch (f) {
377 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
378 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
379 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
380 default: unreachable("Invalid");
381 }
382 }
383
384 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
385 struct mali_midgard_sampler_packed *hw)
386 {
387 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
388 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
389 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
390 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
391 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
392 cfg.normalized_coordinates = cso->normalized_coords;
393
394 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
395
396 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
397
398 /* If necessary, we disable mipmapping in the sampler descriptor by
399 * clamping the LOD as tight as possible (from 0 to epsilon,
400 * essentially -- remember these are fixed point numbers, so
401 * epsilon=1/256) */
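/* e.g. min_lod = 0 packs minimum_lod to 0, so maximum_lod below becomes 1,
 * i.e. the 1/256 epsilon just described */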
402
403 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
404 cfg.minimum_lod + 1 :
405 FIXED_16(cso->max_lod, false);
406
407 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
408 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
409 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
410
411 cfg.compare_function = panfrost_sampler_compare_func(cso);
412 cfg.seamless_cube_map = cso->seamless_cube_map;
413
414 cfg.border_color_r = cso->border_color.f[0];
415 cfg.border_color_g = cso->border_color.f[1];
416 cfg.border_color_b = cso->border_color.f[2];
417 cfg.border_color_a = cso->border_color.f[3];
418 }
419 }
420
421 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
422 struct mali_bifrost_sampler_packed *hw)
423 {
424 pan_pack(hw, BIFROST_SAMPLER, cfg) {
425 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
426 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
427 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
428 cfg.normalized_coordinates = cso->normalized_coords;
429
430 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
431 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
432 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
433
434 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
435 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
436 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
437
438 cfg.compare_function = panfrost_sampler_compare_func(cso);
439 cfg.seamless_cube_map = cso->seamless_cube_map;
440 }
441 }
442
443 static bool
444 panfrost_fs_required(
445 struct panfrost_shader_state *fs,
446 struct panfrost_blend_final *blend,
447 unsigned rt_count)
448 {
449 /* If we generally have side effects */
450 if (fs->fs_sidefx)
451 return true;
452
453 /* If colour is written we need to execute */
454 for (unsigned i = 0; i < rt_count; ++i) {
455 if (!blend[i].no_colour)
456 return true;
457 }
458
459 /* If depth is written and not implied we need to execute.
460 * TODO: Predicate on Z/S writes being enabled */
461 return (fs->writes_depth || fs->writes_stencil);
462 }
463
464 static void
465 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
466 struct panfrost_blend_final *blend)
467 {
468 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
469 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
470 unsigned rt_count = batch->key.nr_cbufs;
471
472 struct bifrost_blend_rt *brts = rts;
473 struct midgard_blend_rt *mrts = rts;
474
475 /* Disable blending for depth-only on Bifrost */
476
477 if (rt_count == 0 && dev->quirks & IS_BIFROST)
478 brts[0].unk2 = 0x3;
479
480 for (unsigned i = 0; i < rt_count; ++i) {
481 unsigned flags = 0;
482
483 pan_pack(&flags, BLEND_FLAGS, cfg) {
484 if (blend[i].no_colour) {
485 cfg.enable = false;
486 break;
487 }
488
489 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
490
491 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
492 cfg.load_destination = blend[i].load_dest;
493 cfg.dither_disable = !batch->ctx->blend->base.dither;
494
495 if (!(dev->quirks & IS_BIFROST))
496 cfg.midgard_blend_shader = blend[i].is_shader;
497 }
498
499 if (dev->quirks & IS_BIFROST) {
500 brts[i].flags = flags;
501
502 if (blend[i].is_shader) {
503 /* The blend shader's address needs to be at
504 * the same top 32 bits as the fragment shader.
505 * TODO: Ensure that's always the case.
506 */
507 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
508 (fs->bo->gpu & (0xffffffffull << 32)));
509 brts[i].shader = blend[i].shader.gpu;
510 brts[i].unk2 = 0x0;
511 } else {
512 enum pipe_format format = batch->key.cbufs[i]->format;
513 const struct util_format_description *format_desc;
514 format_desc = util_format_description(format);
515
516 brts[i].equation = blend[i].equation.equation;
517
518 /* TODO: this is a bit more complicated */
519 brts[i].constant = blend[i].equation.constant;
520
521 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
522
523 /* 0x19 disables blending and forces REPLACE
524 * mode (equivalent to rgb_mode = alpha_mode =
525 * x122, colour mask = 0xF). 0x1a allows
526 * blending. */
527 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
528
529 brts[i].shader_type = fs->blend_types[i];
530 }
531 } else {
532 memcpy(&mrts[i].flags, &flags, sizeof(flags));
533
534 if (blend[i].is_shader) {
535 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
536 } else {
537 mrts[i].blend.equation = blend[i].equation.equation;
538 mrts[i].blend.constant = blend[i].equation.constant;
539 }
540 }
541 }
542 }
543
544 static void
545 panfrost_emit_frag_shader(struct panfrost_context *ctx,
546 struct mali_shader_meta *fragmeta,
547 struct panfrost_blend_final *blend)
548 {
549 const struct panfrost_device *dev = pan_device(ctx->base.screen);
550 struct panfrost_shader_state *fs;
551
552 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
553
554 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
555 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
556
557 memset(fragmeta, 0, sizeof(*fragmeta));
558
559 fragmeta->shader = fs->shader;
560 fragmeta->attribute_count = fs->attribute_count;
561 fragmeta->varying_count = fs->varying_count;
562 fragmeta->texture_count = ctx->sampler_view_count[PIPE_SHADER_FRAGMENT];
563 fragmeta->sampler_count = ctx->sampler_count[PIPE_SHADER_FRAGMENT];
564
565 if (dev->quirks & IS_BIFROST) {
566 /* First clause ATEST |= 0x4000000.
567 * Less than 32 regs |= 0x200 */
568 fragmeta->bifrost1.unk1 = 0x950020;
569
570 fragmeta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
571 fragmeta->bifrost2.preload_regs = 0x1;
572 SET_BIT(fragmeta->bifrost2.preload_regs, 0x10, fs->reads_frag_coord);
573
574 fragmeta->bifrost2.uniform_count = fs->uniform_count;
575 } else {
576 struct mali_midgard_properties_packed prop;
577
578 /* Reasons to disable early-Z from a shader perspective */
579 bool late_z = fs->can_discard || fs->writes_global ||
580 fs->writes_depth || fs->writes_stencil;
581
582 /* Reasons to disable early-Z from a CSO perspective */
583 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
584
585 /* If either depth or stencil is enabled, discard matters */
586 bool zs_enabled =
587 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
588 zsa->base.stencil[0].enabled;
589
590 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
591 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
592 cfg.uniform_count = fs->uniform_count;
593 cfg.work_register_count = fs->work_reg_count;
594 cfg.writes_globals = fs->writes_global;
595 cfg.suppress_inf_nan = true; /* XXX */
596
597 cfg.stencil_from_shader = fs->writes_stencil;
598 cfg.helper_invocation_enable = fs->helper_invocations;
599 cfg.depth_source = fs->writes_depth ?
600 MALI_DEPTH_SOURCE_SHADER :
601 MALI_DEPTH_SOURCE_FIXED_FUNCTION;
602
603 /* Depend on other state */
604 cfg.early_z_enable = !(late_z || alpha_to_coverage);
605 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
606 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
607 }
608
609 memcpy(&fragmeta->midgard1, &prop, sizeof(prop));
610 }
611
612 bool msaa = rast->multisample;
613 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
614
615 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
616 fragmeta->unknown2_4 = 0x4e0;
617
618 /* TODO: Sample size */
619 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
620 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
621
622 /* EXT_shader_framebuffer_fetch requires the shader to be run
623 * per-sample when outputs are read. */
624 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
625 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
626
627 fragmeta->depth_units = rast->offset_units * 2.0f;
628 fragmeta->depth_factor = rast->offset_scale;
629
630 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
631
632 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
633 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
634
635 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
636 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
637
638 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
639 zsa->base.stencil[0].enabled);
640
641 fragmeta->stencil_mask_front = zsa->stencil_mask_front;
642 fragmeta->stencil_mask_back = zsa->stencil_mask_back;
643
644 /* Bottom bits for stencil ref, exactly one word */
645 fragmeta->stencil_front.opaque[0] = zsa->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
646
647 /* If back-stencil is not enabled, use the front values */
648
649 if (zsa->base.stencil[1].enabled)
650 fragmeta->stencil_back.opaque[0] = zsa->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
651 else
652 fragmeta->stencil_back = fragmeta->stencil_front;
653
654 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
655 zsa->base.depth.writemask);
656
657 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
658 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
659 zsa->base.depth.enabled ? zsa->base.depth.func : PIPE_FUNC_ALWAYS));
660
661 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
662 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
663 !ctx->blend->base.dither);
664
665 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
666
667 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
668 ctx->blend->base.alpha_to_coverage);
669
670 /* Get blending setup */
671 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
672
673 /* Disable shader execution if we can */
674 if (dev->quirks & MIDGARD_SHADERLESS
675 && !panfrost_fs_required(fs, blend, rt_count)) {
676 fragmeta->shader = 0x1;
677 fragmeta->attribute_count = 0;
678 fragmeta->varying_count = 0;
679 fragmeta->texture_count = 0;
680 fragmeta->sampler_count = 0;
681
682 /* This feature is not known to work on Bifrost */
683 fragmeta->midgard1.work_count = 1;
684 fragmeta->midgard1.uniform_count = 0;
685 fragmeta->midgard1.uniform_buffer_count = 0;
686 }
687
688 /* If there is a blend shader, work registers are shared. We impose 8
689 * work registers as a limit for blend shaders. Should be lower XXX */
690
691 if (!(dev->quirks & IS_BIFROST)) {
692 for (unsigned c = 0; c < rt_count; ++c) {
693 if (blend[c].is_shader) {
694 fragmeta->midgard1.work_count =
695 MAX2(fragmeta->midgard1.work_count, 8);
696 }
697 }
698 }
699
700 if (dev->quirks & MIDGARD_SFBD) {
701 /* On single render target (SFBD) platforms, the blend
702 * information is inside the shader meta itself. We additionally
703 * need to signal CAN_DISCARD for nontrivial blend modes (so
704 * we're able to read back the destination buffer) */
705
706 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
707 blend[0].is_shader);
708
709 if (blend[0].is_shader) {
710 fragmeta->blend.shader = blend[0].shader.gpu |
711 blend[0].shader.first_tag;
712 } else {
713 fragmeta->blend.equation = blend[0].equation.equation;
714 fragmeta->blend.constant = blend[0].equation.constant;
715 }
716
717 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
718 blend[0].load_dest);
719 } else if (!(dev->quirks & IS_BIFROST)) {
720 /* Bug where MRT-capable hw apparently reads the last blend
721 * shader from here instead of the usual location? */
722
723 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
724 if (!blend[rt].is_shader)
725 continue;
726
727 fragmeta->blend.shader = blend[rt].shader.gpu |
728 blend[rt].shader.first_tag;
729 break;
730 }
731 }
732
733 if (dev->quirks & IS_BIFROST) {
734 bool no_blend = true;
735
736 for (unsigned i = 0; i < rt_count; ++i)
737 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
738
739 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
740 !fs->can_discard && !fs->writes_depth && no_blend);
741 }
742 }
743
744 void
745 panfrost_emit_shader_meta(struct panfrost_batch *batch,
746 enum pipe_shader_type st,
747 struct mali_vertex_tiler_postfix *postfix)
748 {
749 struct panfrost_context *ctx = batch->ctx;
750 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
751
752 if (!ss) {
753 postfix->shader = 0;
754 return;
755 }
756
757 struct mali_shader_meta meta;
758
759 /* Add the shader BO to the batch. */
760 panfrost_batch_add_bo(batch, ss->bo,
761 PAN_BO_ACCESS_PRIVATE |
762 PAN_BO_ACCESS_READ |
763 panfrost_bo_access_for_stage(st));
764
765 mali_ptr shader_ptr;
766
767 if (st == PIPE_SHADER_FRAGMENT) {
768 struct panfrost_device *dev = pan_device(ctx->base.screen);
769 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
770 size_t desc_size = sizeof(meta);
771 void *rts = NULL;
772 struct panfrost_transfer xfer;
773 unsigned rt_size;
774
775 if (dev->quirks & MIDGARD_SFBD)
776 rt_size = 0;
777 else if (dev->quirks & IS_BIFROST)
778 rt_size = sizeof(struct bifrost_blend_rt);
779 else
780 rt_size = sizeof(struct midgard_blend_rt);
781
782 desc_size += rt_size * rt_count;
783
784 if (rt_size)
785 rts = rzalloc_size(ctx, rt_size * rt_count);
786
787 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
788
789 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
790 blend[c] = panfrost_get_blend_for_context(ctx, c);
791
792 panfrost_emit_frag_shader(ctx, &meta, blend);
793
794 if (!(dev->quirks & MIDGARD_SFBD))
795 panfrost_emit_blend(batch, rts, blend);
796 else
797 batch->draws |= PIPE_CLEAR_COLOR0;
798
799 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
800
801 memcpy(xfer.cpu, &meta, sizeof(meta));
802 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
803
804 if (rt_size)
805 ralloc_free(rts);
806
807 shader_ptr = xfer.gpu;
808 } else {
809 panfrost_emit_compute_shader(ctx, st, &meta);
810
811 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
812 sizeof(meta));
813 }
814
815 postfix->shader = shader_ptr;
816 }
817
818 void
819 panfrost_emit_viewport(struct panfrost_batch *batch,
820 struct mali_vertex_tiler_postfix *tiler_postfix)
821 {
822 struct panfrost_context *ctx = batch->ctx;
823 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
824 const struct pipe_scissor_state *ss = &ctx->scissor;
825 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
826 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
827
828 /* Derive min/max from translate/scale. Note since |x| >= 0 by
829 * definition, we have that -|x| <= |x| hence translate - |scale| <=
830 * translate + |scale|, so the ordering is correct here. */
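/* For example, a viewport covering 0..800 in X has translate[0] = 400 and
 * scale[0] = +/-400, so vp_minx = 0 and vp_maxx = 800; the fabsf() makes the
 * sign of the scale irrelevant */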
831 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
832 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
833 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
834 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
835 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
836 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
837
838 /* Scissor to the intersection of the viewport and the scissor, clamped
839 * to the framebuffer */
840
841 unsigned minx = MIN2(fb->width, vp_minx);
842 unsigned maxx = MIN2(fb->width, vp_maxx);
843 unsigned miny = MIN2(fb->height, vp_miny);
844 unsigned maxy = MIN2(fb->height, vp_maxy);
845
846 if (ss && rast->scissor) {
847 minx = MAX2(ss->minx, minx);
848 miny = MAX2(ss->miny, miny);
849 maxx = MIN2(ss->maxx, maxx);
850 maxy = MIN2(ss->maxy, maxy);
851 }
852
853 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
854
855 pan_pack(T.cpu, VIEWPORT, cfg) {
856 cfg.scissor_minimum_x = minx;
857 cfg.scissor_minimum_y = miny;
858 cfg.scissor_maximum_x = maxx - 1;
859 cfg.scissor_maximum_y = maxy - 1;
860
861 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
862 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
863 }
864
865 tiler_postfix->viewport = T.gpu;
866 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
867 }
868
869 static mali_ptr
870 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
871 enum pipe_shader_type st,
872 struct panfrost_constant_buffer *buf,
873 unsigned index)
874 {
875 struct pipe_constant_buffer *cb = &buf->cb[index];
876 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
877
878 if (rsrc) {
879 panfrost_batch_add_bo(batch, rsrc->bo,
880 PAN_BO_ACCESS_SHARED |
881 PAN_BO_ACCESS_READ |
882 panfrost_bo_access_for_stage(st));
883
884 /* Alignment guaranteed by
885 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
886 return rsrc->bo->gpu + cb->buffer_offset;
887 } else if (cb->user_buffer) {
888 return panfrost_pool_upload_aligned(&batch->pool,
889 cb->user_buffer +
890 cb->buffer_offset,
891 cb->buffer_size, 16);
892 } else {
893 unreachable("No constant buffer");
894 }
895 }
896
897 struct sysval_uniform {
898 union {
899 float f[4];
900 int32_t i[4];
901 uint32_t u[4];
902 uint64_t du[2];
903 };
904 };
905
906 static void
907 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
908 struct sysval_uniform *uniform)
909 {
910 struct panfrost_context *ctx = batch->ctx;
911 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
912
913 uniform->f[0] = vp->scale[0];
914 uniform->f[1] = vp->scale[1];
915 uniform->f[2] = vp->scale[2];
916 }
917
918 static void
919 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
920 struct sysval_uniform *uniform)
921 {
922 struct panfrost_context *ctx = batch->ctx;
923 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
924
925 uniform->f[0] = vp->translate[0];
926 uniform->f[1] = vp->translate[1];
927 uniform->f[2] = vp->translate[2];
928 }
929
930 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
931 enum pipe_shader_type st,
932 unsigned int sysvalid,
933 struct sysval_uniform *uniform)
934 {
935 struct panfrost_context *ctx = batch->ctx;
936 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
937 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
938 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
939 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
940
941 assert(dim);
942 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
943
944 if (dim > 1)
945 uniform->i[1] = u_minify(tex->texture->height0,
946 tex->u.tex.first_level);
947
948 if (dim > 2)
949 uniform->i[2] = u_minify(tex->texture->depth0,
950 tex->u.tex.first_level);
951
952 if (is_array)
953 uniform->i[dim] = tex->texture->array_size;
954 }
955
956 static void
957 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
958 enum pipe_shader_type st,
959 unsigned ssbo_id,
960 struct sysval_uniform *uniform)
961 {
962 struct panfrost_context *ctx = batch->ctx;
963
964 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
965 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
966
967 /* Compute address */
968 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
969
970 panfrost_batch_add_bo(batch, bo,
971 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
972 panfrost_bo_access_for_stage(st));
973
974 /* Upload address and size as sysval */
975 uniform->du[0] = bo->gpu + sb.buffer_offset;
976 uniform->u[2] = sb.buffer_size;
977 }
978
979 static void
980 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
981 enum pipe_shader_type st,
982 unsigned samp_idx,
983 struct sysval_uniform *uniform)
984 {
985 struct panfrost_context *ctx = batch->ctx;
986 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
987
988 uniform->f[0] = sampl->min_lod;
989 uniform->f[1] = sampl->max_lod;
990 uniform->f[2] = sampl->lod_bias;
991
992 /* Even without any errata, Midgard represents "no mipmapping" as
993 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
994 * panfrost_create_sampler_state which also explains our choice of
995 * epsilon value (again to keep behaviour consistent) */
996
997 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
998 uniform->f[1] = uniform->f[0] + (1.0/256.0);
999 }
1000
1001 static void
1002 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1003 struct sysval_uniform *uniform)
1004 {
1005 struct panfrost_context *ctx = batch->ctx;
1006
1007 uniform->u[0] = ctx->compute_grid->grid[0];
1008 uniform->u[1] = ctx->compute_grid->grid[1];
1009 uniform->u[2] = ctx->compute_grid->grid[2];
1010 }
1011
1012 static void
1013 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1014 struct panfrost_shader_state *ss,
1015 enum pipe_shader_type st)
1016 {
1017 struct sysval_uniform *uniforms = (void *)buf;
1018
1019 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1020 int sysval = ss->sysval[i];
1021
1022 switch (PAN_SYSVAL_TYPE(sysval)) {
1023 case PAN_SYSVAL_VIEWPORT_SCALE:
1024 panfrost_upload_viewport_scale_sysval(batch,
1025 &uniforms[i]);
1026 break;
1027 case PAN_SYSVAL_VIEWPORT_OFFSET:
1028 panfrost_upload_viewport_offset_sysval(batch,
1029 &uniforms[i]);
1030 break;
1031 case PAN_SYSVAL_TEXTURE_SIZE:
1032 panfrost_upload_txs_sysval(batch, st,
1033 PAN_SYSVAL_ID(sysval),
1034 &uniforms[i]);
1035 break;
1036 case PAN_SYSVAL_SSBO:
1037 panfrost_upload_ssbo_sysval(batch, st,
1038 PAN_SYSVAL_ID(sysval),
1039 &uniforms[i]);
1040 break;
1041 case PAN_SYSVAL_NUM_WORK_GROUPS:
1042 panfrost_upload_num_work_groups_sysval(batch,
1043 &uniforms[i]);
1044 break;
1045 case PAN_SYSVAL_SAMPLER:
1046 panfrost_upload_sampler_sysval(batch, st,
1047 PAN_SYSVAL_ID(sysval),
1048 &uniforms[i]);
1049 break;
1050 default:
1051 assert(0);
1052 }
1053 }
1054 }
1055
1056 static const void *
1057 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1058 unsigned index)
1059 {
1060 struct pipe_constant_buffer *cb = &buf->cb[index];
1061 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1062
1063 if (rsrc)
1064 return rsrc->bo->cpu;
1065 else if (cb->user_buffer)
1066 return cb->user_buffer;
1067 else
1068 unreachable("No constant buffer");
1069 }
1070
1071 void
1072 panfrost_emit_const_buf(struct panfrost_batch *batch,
1073 enum pipe_shader_type stage,
1074 struct mali_vertex_tiler_postfix *postfix)
1075 {
1076 struct panfrost_context *ctx = batch->ctx;
1077 struct panfrost_shader_variants *all = ctx->shader[stage];
1078
1079 if (!all)
1080 return;
1081
1082 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1083
1084 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1085
1086 /* Uniforms are implicitly UBO #0 */
1087 bool has_uniforms = buf->enabled_mask & (1 << 0);
1088
1089 /* Allocate room for the sysval and the uniforms */
1090 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1091 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1092 size_t size = sys_size + uniform_size;
1093 struct panfrost_transfer transfer =
1094 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1095
1096 /* Upload sysvals requested by the shader */
1097 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1098
1099 /* Upload uniforms */
1100 if (has_uniforms && uniform_size) {
1101 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1102 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1103 }
1104
1105 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1106 * uploaded */
1107
1108 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1109 assert(ubo_count >= 1);
1110
1111 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1112 struct panfrost_transfer ubos =
1113 panfrost_pool_alloc_aligned(&batch->pool, sz,
1114 MALI_UNIFORM_BUFFER_LENGTH);
1115
1116 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1117
1118 /* Upload uniforms as a UBO */
1119
1120 if (size) {
1121 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1122 cfg.entries = DIV_ROUND_UP(size, 16);
1123 cfg.pointer = transfer.gpu;
1124 }
1125 } else {
1126 *ubo_ptr = 0;
1127 }
1128
1129 /* The rest are honest-to-goodness UBOs */
1130
1131 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1132 size_t usz = buf->cb[ubo].buffer_size;
1133 bool enabled = buf->enabled_mask & (1 << ubo);
1134 bool empty = usz == 0;
1135
1136 if (!enabled || empty) {
1137 ubo_ptr[ubo] = 0;
1138 continue;
1139 }
1140
1141 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1142 cfg.entries = DIV_ROUND_UP(usz, 16);
1143 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1144 stage, buf, ubo);
1145 }
1146 }
1147
1148 postfix->uniforms = transfer.gpu;
1149 postfix->uniform_buffers = ubos.gpu;
1150
1151 buf->dirty_mask = 0;
1152 }
1153
1154 void
1155 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1156 const struct pipe_grid_info *info,
1157 struct midgard_payload_vertex_tiler *vtp)
1158 {
1159 struct panfrost_context *ctx = batch->ctx;
1160 struct panfrost_device *dev = pan_device(ctx->base.screen);
1161 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1162 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1163 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1164 128));
1165
1166 unsigned log2_instances =
1167 util_logbase2_ceil(info->grid[0]) +
1168 util_logbase2_ceil(info->grid[1]) +
1169 util_logbase2_ceil(info->grid[2]);
1170
1171 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1172 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1173 shared_size,
1174 1);
1175
1176 struct mali_shared_memory shared = {
1177 .shared_memory = bo->gpu,
1178 .shared_workgroup_count = log2_instances,
1179 .shared_shift = util_logbase2(single_size) + 1
1180 };
1181
1182 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1183 sizeof(shared), 64);
1184 }
1185
1186 static mali_ptr
1187 panfrost_get_tex_desc(struct panfrost_batch *batch,
1188 enum pipe_shader_type st,
1189 struct panfrost_sampler_view *view)
1190 {
1191 if (!view)
1192 return (mali_ptr) 0;
1193
1194 struct pipe_sampler_view *pview = &view->base;
1195 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1196
1197 /* Add the BO to the job so it's retained until the job is done. */
1198
1199 panfrost_batch_add_bo(batch, rsrc->bo,
1200 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1201 panfrost_bo_access_for_stage(st));
1202
1203 panfrost_batch_add_bo(batch, view->bo,
1204 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1205 panfrost_bo_access_for_stage(st));
1206
1207 return view->bo->gpu;
1208 }
1209
1210 static void
1211 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1212 struct pipe_context *pctx)
1213 {
1214 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1215 if (view->texture_bo != rsrc->bo->gpu ||
1216 view->modifier != rsrc->modifier) {
1217 panfrost_bo_unreference(view->bo);
1218 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1219 }
1220 }
1221
1222 void
1223 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1224 enum pipe_shader_type stage,
1225 struct mali_vertex_tiler_postfix *postfix)
1226 {
1227 struct panfrost_context *ctx = batch->ctx;
1228 struct panfrost_device *device = pan_device(ctx->base.screen);
1229
1230 if (!ctx->sampler_view_count[stage])
1231 return;
1232
1233 if (device->quirks & IS_BIFROST) {
1234 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1235 MALI_BIFROST_TEXTURE_LENGTH *
1236 ctx->sampler_view_count[stage],
1237 MALI_BIFROST_TEXTURE_LENGTH);
1238
1239 struct mali_bifrost_texture_packed *out =
1240 (struct mali_bifrost_texture_packed *) T.cpu;
1241
1242 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1243 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1244 struct pipe_sampler_view *pview = &view->base;
1245 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1246
1247 panfrost_update_sampler_view(view, &ctx->base);
1248 out[i] = view->bifrost_descriptor;
1249
1250 /* Add the BOs to the job so they are retained until the job is done. */
1251
1252 panfrost_batch_add_bo(batch, rsrc->bo,
1253 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1254 panfrost_bo_access_for_stage(stage));
1255
1256 panfrost_batch_add_bo(batch, view->bo,
1257 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1258 panfrost_bo_access_for_stage(stage));
1259 }
1260
1261 postfix->textures = T.gpu;
1262 } else {
1263 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1264
1265 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1266 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1267
1268 panfrost_update_sampler_view(view, &ctx->base);
1269
1270 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1271 }
1272
1273 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1274 trampolines,
1275 sizeof(uint64_t) *
1276 ctx->sampler_view_count[stage],
1277 sizeof(uint64_t));
1278 }
1279 }
1280
1281 void
1282 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1283 enum pipe_shader_type stage,
1284 struct mali_vertex_tiler_postfix *postfix)
1285 {
1286 struct panfrost_context *ctx = batch->ctx;
1287
1288 if (!ctx->sampler_count[stage])
1289 return;
1290
1291 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1292 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1293
1294 size_t sz = desc_size * ctx->sampler_count[stage];
1295 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1296 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1297
1298 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1299 out[i] = ctx->samplers[stage][i]->hw;
1300
1301 postfix->sampler_descriptor = T.gpu;
1302 }
1303
1304 void
1305 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1306 struct mali_vertex_tiler_postfix *vertex_postfix)
1307 {
1308 struct panfrost_context *ctx = batch->ctx;
1309 struct panfrost_vertex_state *so = ctx->vertex;
1310 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1311
1312 unsigned instance_shift = vertex_postfix->instance_shift;
1313 unsigned instance_odd = vertex_postfix->instance_odd;
1314
1315 /* Worst case: everything is NPOT, which is only possible if instancing
1316 * is enabled. Otherwise a single record is guaranteed */
1317 bool could_npot = instance_shift || instance_odd;
1318
1319 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1320 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1321 (could_npot ? 2 : 1),
1322 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1323
1324 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1325 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1326 MALI_ATTRIBUTE_LENGTH);
1327
1328 struct mali_attribute_buffer_packed *bufs =
1329 (struct mali_attribute_buffer_packed *) S.cpu;
1330
1331 struct mali_attribute_packed *out =
1332 (struct mali_attribute_packed *) T.cpu;
1333
1334 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1335 unsigned k = 0;
1336
1337 for (unsigned i = 0; i < so->num_elements; ++i) {
1338 /* We map buffers 1:1 with the attributes, which
1339 * means duplicating some vertex buffers (who cares? aside from
1340 * maybe some caching implications but I somehow doubt that
1341 * matters) */
1342
1343 struct pipe_vertex_element *elem = &so->pipe[i];
1344 unsigned vbi = elem->vertex_buffer_index;
1345 attrib_to_buffer[i] = k;
1346
1347 if (!(ctx->vb_mask & (1 << vbi)))
1348 continue;
1349
1350 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1351 struct panfrost_resource *rsrc;
1352
1353 rsrc = pan_resource(buf->buffer.resource);
1354 if (!rsrc)
1355 continue;
1356
1357 /* Add a dependency of the batch on the vertex buffer */
1358 panfrost_batch_add_bo(batch, rsrc->bo,
1359 PAN_BO_ACCESS_SHARED |
1360 PAN_BO_ACCESS_READ |
1361 PAN_BO_ACCESS_VERTEX_TILER);
1362
1363 /* Mask off lower bits, see offset fixup below */
1364 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1365 mali_ptr addr = raw_addr & ~63;
1366
1367 /* Since we advanced the base pointer, we shrink the buffer
1368 * size, but add the offset we subtracted */
1369 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1370 - buf->buffer_offset;
1371
1372 /* When there is a divisor, the hardware-level divisor is
1373 * the product of the instance divisor and the padded count */
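/* e.g. an instance divisor of 2 with a padded count of 12 yields a
 * hardware divisor of 24 */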
1374 unsigned divisor = elem->instance_divisor;
1375 unsigned hw_divisor = ctx->padded_count * divisor;
1376 unsigned stride = buf->stride;
1377
1378 /* If there's a divisor(=1) but no instancing, we want every
1379 * attribute to be the same */
1380
1381 if (divisor && ctx->instance_count == 1)
1382 stride = 0;
1383
1384 if (!divisor || ctx->instance_count <= 1) {
1385 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1386 if (ctx->instance_count > 1)
1387 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1388
1389 cfg.pointer = addr;
1390 cfg.stride = stride;
1391 cfg.size = size;
1392 cfg.divisor_r = instance_shift;
1393 cfg.divisor_p = instance_odd;
1394 }
1395 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1396 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1397 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1398 cfg.pointer = addr;
1399 cfg.stride = stride;
1400 cfg.size = size;
1401 cfg.divisor_r = __builtin_ctz(hw_divisor);
1402 }
1403
1404 } else {
1405 unsigned shift = 0, extra_flags = 0;
1406
1407 unsigned magic_divisor =
1408 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1409
1410 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1411 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1412 cfg.pointer = addr;
1413 cfg.stride = stride;
1414 cfg.size = size;
1415
1416 cfg.divisor_r = shift;
1417 cfg.divisor_e = extra_flags;
1418 }
1419
1420 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1421 cfg.divisor_numerator = magic_divisor;
1422 cfg.divisor = divisor;
1423 }
1424
1425 ++k;
1426 }
1427
1428 ++k;
1429 }
1430
1431 /* Add special gl_VertexID/gl_InstanceID buffers */
1432
1433 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1434 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1435
1436 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1437 cfg.buffer_index = k++;
1438 cfg.format = so->formats[PAN_VERTEX_ID];
1439 }
1440
1441 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1442
1443 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1444 cfg.buffer_index = k++;
1445 cfg.format = so->formats[PAN_INSTANCE_ID];
1446 }
1447 }
1448
1449 /* Attribute addresses require 64-byte alignment, so let:
1450 *
1451 * base' = base & ~63 = base - (base & 63)
1452 * offset' = offset + (base & 63)
1453 *
1454 * Since base' + offset' = base + offset, these are equivalent
1455 * addressing modes and now base is 64 aligned.
1456 */
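/* For example, base = 0x10030 with offset = 8 becomes base' = 0x10000 and
 * offset' = 8 + 0x30 = 0x38; both still address byte 0x10038, but base' is
 * now 64-byte aligned */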
1457
1458 unsigned start = vertex_postfix->offset_start;
1459
1460 for (unsigned i = 0; i < so->num_elements; ++i) {
1461 unsigned vbi = so->pipe[i].vertex_buffer_index;
1462 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1463
1464 /* Adjust by the masked off bits of the offset. Make sure we
1465 * read src_offset from so->hw (which is not GPU visible)
1466 * rather than target (which is) due to caching effects */
1467
1468 unsigned src_offset = so->pipe[i].src_offset;
1469
1470 /* BOs aligned to 4k so guaranteed aligned to 64 */
1471 src_offset += (buf->buffer_offset & 63);
1472
1473 /* Also, somewhat obscurely, per-instance data needs to be
1474 * offset in response to a delayed start in an indexed draw */
1475
1476 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1477 src_offset -= buf->stride * start;
1478
1479 pan_pack(out + i, ATTRIBUTE, cfg) {
1480 cfg.buffer_index = attrib_to_buffer[i];
1481 cfg.format = so->formats[i];
1482 cfg.offset = src_offset;
1483 }
1484 }
1485
1486 vertex_postfix->attributes = S.gpu;
1487 vertex_postfix->attribute_meta = T.gpu;
1488 }
1489
1490 static mali_ptr
1491 panfrost_emit_varyings(struct panfrost_batch *batch,
1492 struct mali_attribute_buffer_packed *slot,
1493 unsigned stride, unsigned count)
1494 {
1495 unsigned size = stride * count;
1496 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1497
1498 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1499 cfg.stride = stride;
1500 cfg.size = size;
1501 cfg.pointer = ptr;
1502 }
1503
1504 return ptr;
1505 }
1506
1507 static unsigned
1508 panfrost_streamout_offset(unsigned stride, unsigned offset,
1509 struct pipe_stream_output_target *target)
1510 {
1511 return (target->buffer_offset + (offset * stride * 4)) & 63;
1512 }
1513
1514 static void
1515 panfrost_emit_streamout(struct panfrost_batch *batch,
1516 struct mali_attribute_buffer_packed *slot,
1517 unsigned stride_words, unsigned offset, unsigned count,
1518 struct pipe_stream_output_target *target)
1519 {
1520 unsigned stride = stride_words * 4;
1521 unsigned max_size = target->buffer_size;
1522 unsigned expected_size = stride * count;
1523
1524 /* Grab the BO and bind it to the batch */
1525 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1526
1527 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1528 * the perspective of the TILER and FRAGMENT.
1529 */
1530 panfrost_batch_add_bo(batch, bo,
1531 PAN_BO_ACCESS_SHARED |
1532 PAN_BO_ACCESS_RW |
1533 PAN_BO_ACCESS_VERTEX_TILER |
1534 PAN_BO_ACCESS_FRAGMENT);
1535
1536 /* We will have an offset applied to get alignment */
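/* e.g. if addr ends in 0x24, the pointer below is rounded down by 36 bytes,
 * which are added back through the size and the record offset computed in
 * panfrost_streamout_offset */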
1537 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1538
1539 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1540 cfg.pointer = (addr & ~63);
1541 cfg.stride = stride;
1542 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1543 }
1544 }
1545
1546 static bool
1547 has_point_coord(unsigned mask, gl_varying_slot loc)
1548 {
1549 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1550 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1551 else if (loc == VARYING_SLOT_PNTC)
1552 return (mask & (1 << 8));
1553 else
1554 return false;
1555 }
1556
1557 /* Helpers for manipulating stream out information so we can pack varyings
1558 * accordingly. Compute the src_offset for a given captured varying */
1559
1560 static struct pipe_stream_output *
1561 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1562 {
1563 for (unsigned i = 0; i < info->num_outputs; ++i) {
1564 if (info->output[i].register_index == loc)
1565 return &info->output[i];
1566 }
1567
1568 unreachable("Varying not captured");
1569 }
1570
1571 static unsigned
1572 pan_varying_size(enum mali_format fmt)
1573 {
1574 unsigned type = MALI_EXTRACT_TYPE(fmt);
1575 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1576 unsigned bits = MALI_EXTRACT_BITS(fmt);
1577 unsigned bpc = 0;
1578
1579 if (bits == MALI_CHANNEL_FLOAT) {
1580 /* No doubles */
1581 bool fp16 = (type == MALI_FORMAT_SINT);
1582 assert(fp16 || (type == MALI_FORMAT_UNORM));
1583
1584 bpc = fp16 ? 2 : 4;
1585 } else {
1586 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1587
1588 /* See the enums */
1589 bits = 1 << bits;
1590 assert(bits >= 8);
1591 bpc = bits / 8;
1592 }
1593
1594 return bpc * chan;
1595 }
1596
1597 /* Indices for named (non-XFB) varyings that are present. These are packed
1598 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1599 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1600 * of a given special field given a shift S by:
1601 *
1602 * idx = popcount(P & ((1 << S) - 1))
1603 *
1604 * That is, count the varyings that come earlier; that count is this buffer's
1605 * index. Likewise, the total number of special
1606 * buffers required is simply popcount(P)
1607 */
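/* For example, P = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION) |
 * (1 << PAN_VARY_PSIZ) = 0b111 gives PAN_VARY_PSIZ (S = 2) the buffer index
 * popcount(0b111 & 0b011) = 2, and popcount(P) = 3 buffers are needed in
 * total */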
1608
1609 enum pan_special_varying {
1610 PAN_VARY_GENERAL = 0,
1611 PAN_VARY_POSITION = 1,
1612 PAN_VARY_PSIZ = 2,
1613 PAN_VARY_PNTCOORD = 3,
1614 PAN_VARY_FACE = 4,
1615 PAN_VARY_FRAGCOORD = 5,
1616
1617 /* Keep last */
1618 PAN_VARY_MAX,
1619 };
1620
1621 /* Given a varying, figure out which index it corresponds to */
1622
1623 static inline unsigned
1624 pan_varying_index(unsigned present, enum pan_special_varying v)
1625 {
1626 unsigned mask = (1 << v) - 1;
1627 return util_bitcount(present & mask);
1628 }
1629
1630 /* Get the base offset for XFB buffers, which by convention come after
1631 * everything else. Wrapper function for semantic reasons; by construction this
1632 * is just popcount. */
1633
1634 static inline unsigned
1635 pan_xfb_base(unsigned present)
1636 {
1637 return util_bitcount(present);
1638 }
1639
1640 /* Computes the present mask for varyings so we can start emitting varying records */
1641
1642 static inline unsigned
1643 pan_varying_present(
1644 struct panfrost_shader_state *vs,
1645 struct panfrost_shader_state *fs,
1646 unsigned quirks)
1647 {
1648 /* At the moment we always emit general and position buffers. Not
1649 * strictly necessary but usually harmless */
1650
1651 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1652
1653 /* Enable special buffers by the shader info */
1654
1655 if (vs->writes_point_size)
1656 present |= (1 << PAN_VARY_PSIZ);
1657
1658 if (fs->reads_point_coord)
1659 present |= (1 << PAN_VARY_PNTCOORD);
1660
1661 if (fs->reads_face)
1662 present |= (1 << PAN_VARY_FACE);
1663
1664 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1665 present |= (1 << PAN_VARY_FRAGCOORD);
1666
1667 /* Also, if we have a point sprite, we need a point coord buffer */
1668
1669 for (unsigned i = 0; i < fs->varying_count; i++) {
1670 gl_varying_slot loc = fs->varyings_loc[i];
1671
1672 if (has_point_coord(fs->point_sprite_mask, loc))
1673 present |= (1 << PAN_VARY_PNTCOORD);
1674 }
1675
1676 return present;
1677 }
1678
1679 /* Emitters for varying records */
1680
1681 static void
1682 pan_emit_vary(struct mali_attribute_packed *out,
1683 unsigned present, enum pan_special_varying buf,
1684 unsigned quirks, enum mali_format format,
1685 unsigned offset)
1686 {
1687 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1688 unsigned swizzle = quirks & HAS_SWIZZLES ?
1689 panfrost_get_default_swizzle(nr_channels) :
1690 panfrost_bifrost_swizzle(nr_channels);
1691
1692 pan_pack(out, ATTRIBUTE, cfg) {
1693 cfg.buffer_index = pan_varying_index(present, buf);
1694 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1695 cfg.format = (format << 12) | swizzle;
1696 cfg.offset = offset;
1697 }
1698 }
1699
1700 /* General varying that is unused */
1701
1702 static void
1703 pan_emit_vary_only(struct mali_attribute_packed *out,
1704 unsigned present, unsigned quirks)
1705 {
1706 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1707 }
1708
1709 /* Special records */
1710
1711 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1712 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1713 [PAN_VARY_PSIZ] = MALI_R16F,
1714 [PAN_VARY_PNTCOORD] = MALI_R16F,
1715 [PAN_VARY_FACE] = MALI_R32I,
1716 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1717 };
1718
1719 static void
1720 pan_emit_vary_special(struct mali_attribute_packed *out,
1721 unsigned present, enum pan_special_varying buf,
1722 unsigned quirks)
1723 {
1724 assert(buf < PAN_VARY_MAX);
1725 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1726 }
1727
1728 static enum mali_format
1729 pan_xfb_format(enum mali_format format, unsigned nr)
1730 {
1731 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1732 return MALI_R32F | MALI_NR_CHANNELS(nr);
1733 else
1734 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1735 }
1736
1737 /* Transform feedback records. Note that struct pipe_stream_output is (when
1738  * packed as a bitfield) only 32 bits, smaller than a 64-bit pointer, so we
1739  * may as well pass it by value. */
1740
1741 static void
1742 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1743 unsigned present,
1744 unsigned max_xfb,
1745 unsigned *streamout_offsets,
1746 unsigned quirks,
1747 enum mali_format format,
1748 struct pipe_stream_output o)
1749 {
1750 unsigned swizzle = quirks & HAS_SWIZZLES ?
1751 panfrost_get_default_swizzle(o.num_components) :
1752 panfrost_bifrost_swizzle(o.num_components);
1753
1754 pan_pack(out, ATTRIBUTE, cfg) {
1755 /* XFB buffers come after everything else */
1756 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1757 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1758
1759 /* Override number of channels and precision to highp */
1760 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1761
1762                 /* Combine the per-output offset (given in dwords) with the
                      * precomputed offset of the streamout buffer */
1763 cfg.offset = (o.dst_offset * 4) /* dwords */
1764 + streamout_offsets[o.output_buffer];
1765 }
1766 }
1767
1768 /* Determine if we should capture a varying for XFB. This requires actually
1769  * having a buffer for it. If we don't capture it, we'll fall back to a general
1770  * varying path (linked or unlinked, possibly discarding the write). */
1771
1772 static bool
1773 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1774 unsigned loc, unsigned max_xfb)
1775 {
1776 if (!(xfb->so_mask & (1ll << loc)))
1777 return false;
1778
1779 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1780 return o->output_buffer < max_xfb;
1781 }
1782
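/* Emit a general-purpose varying, linking it against the other shader stage
 * by location. When walking the producing stage (should_alloc), space is
 * carved out of the general varying buffer by bumping *gen_stride, and the
 * chosen offset and format are recorded so the consuming stage's pass can
 * reuse them. A location the other stage doesn't use is emitted as a
 * discard record. */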
1783 static void
1784 pan_emit_general_varying(struct mali_attribute_packed *out,
1785 struct panfrost_shader_state *other,
1786 struct panfrost_shader_state *xfb,
1787 gl_varying_slot loc,
1788 enum mali_format format,
1789 unsigned present,
1790 unsigned quirks,
1791 unsigned *gen_offsets,
1792 enum mali_format *gen_formats,
1793 unsigned *gen_stride,
1794 unsigned idx,
1795 bool should_alloc)
1796 {
1797 /* Check if we're linked */
1798 signed other_idx = -1;
1799
1800 for (unsigned j = 0; j < other->varying_count; ++j) {
1801 if (other->varyings_loc[j] == loc) {
1802 other_idx = j;
1803 break;
1804 }
1805 }
1806
1807 if (other_idx < 0) {
1808 pan_emit_vary_only(out, present, quirks);
1809 return;
1810 }
1811
1812 unsigned offset = gen_offsets[other_idx];
1813
1814 if (should_alloc) {
1815                 /* We're linked, so allocate space via a watermark allocation */
1816 enum mali_format alt = other->varyings[other_idx];
1817
1818 /* Do interpolation at minimum precision */
1819 unsigned size_main = pan_varying_size(format);
1820 unsigned size_alt = pan_varying_size(alt);
1821 unsigned size = MIN2(size_main, size_alt);
1822
1823 /* If a varying is marked for XFB but not actually captured, we
1824 * should match the format to the format that would otherwise
1825 * be used for XFB, since dEQP checks for invariance here. It's
1826 * unclear if this is required by the spec. */
1827
1828 if (xfb->so_mask & (1ull << loc)) {
1829 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1830 format = pan_xfb_format(format, o->num_components);
1831 size = pan_varying_size(format);
1832 } else if (size == size_alt) {
1833 format = alt;
1834 }
1835
1836 gen_offsets[idx] = *gen_stride;
1837 gen_formats[other_idx] = format;
1838 offset = *gen_stride;
1839 *gen_stride += size;
1840 }
1841
1842 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1843 }
1844
1845 /* Higher-level wrapper around all of the above, classifying a varying into one
1846  * of the cases handled above */
1847
1848 static void
1849 panfrost_emit_varying(
1850 struct mali_attribute_packed *out,
1851 struct panfrost_shader_state *stage,
1852 struct panfrost_shader_state *other,
1853 struct panfrost_shader_state *xfb,
1854 unsigned present,
1855 unsigned max_xfb,
1856 unsigned *streamout_offsets,
1857 unsigned quirks,
1858 unsigned *gen_offsets,
1859 enum mali_format *gen_formats,
1860 unsigned *gen_stride,
1861 unsigned idx,
1862 bool should_alloc,
1863 bool is_fragment)
1864 {
1865 gl_varying_slot loc = stage->varyings_loc[idx];
1866 enum mali_format format = stage->varyings[idx];
1867
1868 /* Override format to match linkage */
1869 if (!should_alloc && gen_formats[idx])
1870 format = gen_formats[idx];
1871
1872 if (has_point_coord(stage->point_sprite_mask, loc)) {
1873 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1874 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1875 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1876 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1877 } else if (loc == VARYING_SLOT_POS) {
1878 if (is_fragment)
1879 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1880 else
1881 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1882 } else if (loc == VARYING_SLOT_PSIZ) {
1883 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1884 } else if (loc == VARYING_SLOT_PNTC) {
1885 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1886 } else if (loc == VARYING_SLOT_FACE) {
1887 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1888 } else {
1889 pan_emit_general_varying(out, other, xfb, loc, format, present,
1890 quirks, gen_offsets, gen_formats, gen_stride,
1891 idx, should_alloc);
1892 }
1893 }
1894
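/* Special inputs (point coord, front facing, fragment coord) are not backed
 * by memory; the attribute buffer is marked with a special tag instead, and
 * the values appear to be generated by the hardware itself. */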
1895 static void
1896 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1897 unsigned present,
1898 enum pan_special_varying v,
1899 unsigned special)
1900 {
1901 if (present & (1 << v)) {
1902 unsigned idx = pan_varying_index(present, v);
1903
1904 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1905 cfg.special = special;
1906 cfg.type = 0;
1907 }
1908 }
1909 }
1910
1911 void
1912 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1913 unsigned vertex_count,
1914 struct mali_vertex_tiler_postfix *vertex_postfix,
1915 struct mali_vertex_tiler_postfix *tiler_postfix,
1916 union midgard_primitive_size *primitive_size)
1917 {
1918 /* Load the shaders */
1919 struct panfrost_context *ctx = batch->ctx;
1920 struct panfrost_device *dev = pan_device(ctx->base.screen);
1921 struct panfrost_shader_state *vs, *fs;
1922 size_t vs_size, fs_size;
1923
1924 /* Allocate the varying descriptor */
1925
1926 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1927 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1928 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1929 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1930
1931 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1932 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1933
1934 struct pipe_stream_output_info *so = &vs->stream_output;
1935 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1936
1937 /* Check if this varying is linked by us. This is the case for
1938 * general-purpose, non-captured varyings. If it is, link it. If it's
1939 * not, use the provided stream out information to determine the
1940 * offset, since it was already linked for us. */
1941
1942 unsigned gen_offsets[32];
1943 enum mali_format gen_formats[32];
1944 memset(gen_offsets, 0, sizeof(gen_offsets));
1945 memset(gen_formats, 0, sizeof(gen_formats));
1946
1947 unsigned gen_stride = 0;
1948 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1949 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1950
1951 unsigned streamout_offsets[32];
1952
1953 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1954 streamout_offsets[i] = panfrost_streamout_offset(
1955 so->stride[i],
1956 ctx->streamout.offsets[i],
1957 ctx->streamout.targets[i]);
1958 }
1959
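
        /* Varying records for the vertex shader come first in the transfer,
         * with the fragment shader's records laid out immediately after. */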
1960 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1961 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1962
1963 for (unsigned i = 0; i < vs->varying_count; i++) {
1964 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1965 ctx->streamout.num_targets, streamout_offsets,
1966 dev->quirks,
1967 gen_offsets, gen_formats, &gen_stride, i, true, false);
1968 }
1969
1970 for (unsigned i = 0; i < fs->varying_count; i++) {
1971 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1972 ctx->streamout.num_targets, streamout_offsets,
1973 dev->quirks,
1974 gen_offsets, gen_formats, &gen_stride, i, false, true);
1975 }
1976
1977 unsigned xfb_base = pan_xfb_base(present);
1978 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1979 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1980 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1981 struct mali_attribute_buffer_packed *varyings =
1982 (struct mali_attribute_buffer_packed *) T.cpu;
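
        /* Buffer layout: one attribute buffer per bit set in `present`
         * (indexed via pan_varying_index), followed by one buffer per
         * enabled stream-out target. */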
1983
1984 /* Emit the stream out buffers */
1985
1986 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1987 ctx->vertex_count);
1988
1989 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1990 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1991 so->stride[i],
1992 ctx->streamout.offsets[i],
1993 out_count,
1994 ctx->streamout.targets[i]);
1995 }
1996
1997 panfrost_emit_varyings(batch,
1998 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1999 gen_stride, vertex_count);
2000
2001 /* fp32 vec4 gl_Position */
2002 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2003 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2004 sizeof(float) * 4, vertex_count);
2005
2006 if (present & (1 << PAN_VARY_PSIZ)) {
2007 primitive_size->pointer = panfrost_emit_varyings(batch,
2008 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2009 2, vertex_count);
2010 }
2011
2012 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2013 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2014 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2015
2016 vertex_postfix->varyings = T.gpu;
2017 tiler_postfix->varyings = T.gpu;
2018
2019 vertex_postfix->varying_meta = trans.gpu;
2020 tiler_postfix->varying_meta = trans.gpu + vs_size;
2021 }
2022
2023 void
2024 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2025 struct mali_vertex_tiler_prefix *vertex_prefix,
2026 struct mali_vertex_tiler_postfix *vertex_postfix,
2027 struct mali_vertex_tiler_prefix *tiler_prefix,
2028 struct mali_vertex_tiler_postfix *tiler_postfix,
2029 union midgard_primitive_size *primitive_size)
2030 {
2031 struct panfrost_context *ctx = batch->ctx;
2032 struct panfrost_device *device = pan_device(ctx->base.screen);
2033 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2034 struct bifrost_payload_vertex bifrost_vertex = {0,};
2035 struct bifrost_payload_tiler bifrost_tiler = {0,};
2036 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2037 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2038 void *vp, *tp;
2039 size_t vp_size, tp_size;
2040
2041 if (device->quirks & IS_BIFROST) {
2042 bifrost_vertex.prefix = *vertex_prefix;
2043 bifrost_vertex.postfix = *vertex_postfix;
2044 vp = &bifrost_vertex;
2045 vp_size = sizeof(bifrost_vertex);
2046
2047 bifrost_tiler.prefix = *tiler_prefix;
2048 bifrost_tiler.tiler.primitive_size = *primitive_size;
2049 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2050 bifrost_tiler.postfix = *tiler_postfix;
2051 tp = &bifrost_tiler;
2052 tp_size = sizeof(bifrost_tiler);
2053 } else {
2054 midgard_vertex.prefix = *vertex_prefix;
2055 midgard_vertex.postfix = *vertex_postfix;
2056 vp = &midgard_vertex;
2057 vp_size = sizeof(midgard_vertex);
2058
2059 midgard_tiler.prefix = *tiler_prefix;
2060 midgard_tiler.postfix = *tiler_postfix;
2061 midgard_tiler.primitive_size = *primitive_size;
2062 tp = &midgard_tiler;
2063 tp_size = sizeof(midgard_tiler);
2064 }
2065
2066 if (wallpapering) {
2067 /* Inject in reverse order, with "predicted" job indices.
2068 * THIS IS A HACK XXX */
2069 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2070 batch->scoreboard.job_index + 2, tp, tp_size, true);
2071 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2072 vp, vp_size, true);
2073 return;
2074 }
2075
2076         /* If rasterizer discard is enabled, only submit the vertex job */
2077
2078 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2079 vp, vp_size, false);
2080
2081 if (ctx->rasterizer->base.rasterizer_discard)
2082 return;
2083
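        /* Otherwise, chain the tiler job after the vertex job by passing the
         * vertex job's index as its dependency */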
2084 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2085 false);
2086 }
2087
2088 /* TODO: stop hardcoding this */
2089 mali_ptr
2090 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2091 {
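        /* 48 (x, y) pairs uploaded as 96 halfwords; the values are presumably
         * in 1/256ths of a pixel, with (128, 128) being the pixel centre. */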
2092 uint16_t locations[] = {
2093 128, 128,
2094 0, 256,
2095 0, 256,
2096 0, 256,
2097 0, 256,
2098 0, 256,
2099 0, 256,
2100 0, 256,
2101 0, 256,
2102 0, 256,
2103 0, 256,
2104 0, 256,
2105 0, 256,
2106 0, 256,
2107 0, 256,
2108 0, 256,
2109 0, 256,
2110 0, 256,
2111 0, 256,
2112 0, 256,
2113 0, 256,
2114 0, 256,
2115 0, 256,
2116 0, 256,
2117 0, 256,
2118 0, 256,
2119 0, 256,
2120 0, 256,
2121 0, 256,
2122 0, 256,
2123 0, 256,
2124 0, 256,
2125 128, 128,
2126 0, 0,
2127 0, 0,
2128 0, 0,
2129 0, 0,
2130 0, 0,
2131 0, 0,
2132 0, 0,
2133 0, 0,
2134 0, 0,
2135 0, 0,
2136 0, 0,
2137 0, 0,
2138 0, 0,
2139 0, 0,
2140 0, 0,
2141 };
2142
2143 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2144 }