panfrost: Pack compute Midgard properties
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), though it could last longer. Also get
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
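/* Worked example (illustrative): a padded count of 20 = 0b10100 decomposes
 * as (2k + 1) << shift, with shift = ctz(20) = 2 and k = 20 >> 3 = 2,
 * i.e. 20 = 5 << 2, an odd factor times a power of two, which is the form
 * the instance_shift/instance_odd fields encode. */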
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static void
310 panfrost_emit_compute_shader(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 meta->shader = ss->shader;
319 meta->attribute_count = ss->attribute_count;
320 meta->varying_count = ss->varying_count;
321 meta->texture_count = ctx->sampler_view_count[st];
322 meta->sampler_count = ctx->sampler_count[st];
323
324 if (dev->quirks & IS_BIFROST) {
325 meta->bifrost1.unk1 = 0x800000;
326 meta->bifrost2.preload_regs = 0xC0;
327 meta->bifrost2.uniform_count = ss->uniform_count;
328 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
329 } else {
330 struct mali_midgard_properties_packed prop;
331
332 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
333 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, st);
334 cfg.uniform_count = ss->uniform_count;
335 cfg.work_register_count = ss->work_reg_count;
336 cfg.writes_globals = ss->writes_global;
337 cfg.suppress_inf_nan = true; /* XXX */
338 }
339
340 memcpy(&meta->midgard1, &prop, sizeof(prop));
341 }
342 }
343
344 static unsigned
345 translate_tex_wrap(enum pipe_tex_wrap w)
346 {
347 switch (w) {
348 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
349 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
350 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
351 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
352 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
353 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
354 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
355 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
356 default: unreachable("Invalid wrap");
357 }
358 }
359
360 /* The hardware compares in the wrong order, so we have to flip before
361 * encoding. Yes, really. */
362
363 static enum mali_func
364 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
365 {
366 if (!cso->compare_mode)
367 return MALI_FUNC_NEVER;
368
369 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
370 return panfrost_flip_compare_func(f);
371 }
372
373 static enum mali_mipmap_mode
374 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
375 {
376 switch (f) {
377 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
378 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
379 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
380 default: unreachable("Invalid");
381 }
382 }
383
384 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
385 struct mali_midgard_sampler_packed *hw)
386 {
387 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
388 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
389 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
390 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
391 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
392 cfg.normalized_coordinates = cso->normalized_coords;
393
394 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
395
396 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
397
398 /* If necessary, we disable mipmapping in the sampler descriptor by
399 * clamping the LOD as tight as possible (from 0 to epsilon,
400 * essentially -- remember these are fixed point numbers, so
401 * epsilon=1/256) */
402
403 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
404 cfg.minimum_lod + 1 :
405 FIXED_16(cso->max_lod, false);
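/* Illustrative numbers: with min_lod = 2.0 this clamps the LOD range to
 * roughly [2.0, 2.0 + 1/256], so only mip level 2 is ever sampled
 * (assuming the 8 fractional bits implied by the epsilon note above). */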
406
407 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
408 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
409 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
410
411 cfg.compare_function = panfrost_sampler_compare_func(cso);
412 cfg.seamless_cube_map = cso->seamless_cube_map;
413
414 cfg.border_color_r = cso->border_color.f[0];
415 cfg.border_color_g = cso->border_color.f[1];
416 cfg.border_color_b = cso->border_color.f[2];
417 cfg.border_color_a = cso->border_color.f[3];
418 }
419 }
420
421 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
422 struct mali_bifrost_sampler_packed *hw)
423 {
424 pan_pack(hw, BIFROST_SAMPLER, cfg) {
425 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
426 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
427 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
428 cfg.normalized_coordinates = cso->normalized_coords;
429
430 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
431 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
432 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
433
434 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
435 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
436 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
437
438 cfg.compare_function = panfrost_sampler_compare_func(cso);
439 cfg.seamless_cube_map = cso->seamless_cube_map;
440 }
441 }
442
443 static bool
444 panfrost_fs_required(
445 struct panfrost_shader_state *fs,
446 struct panfrost_blend_final *blend,
447 unsigned rt_count)
448 {
449 /* If we generally have side effects */
450 if (fs->fs_sidefx)
451 return true;
452
453 /* If colour is written we need to execute */
454 for (unsigned i = 0; i < rt_count; ++i) {
455 if (!blend[i].no_colour)
456 return true;
457 }
458
459 /* If depth is written and not implied we need to execute.
460 * TODO: Predicate on Z/S writes being enabled */
461 return (fs->writes_depth || fs->writes_stencil);
462 }
463
464 static void
465 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
466 struct panfrost_blend_final *blend)
467 {
468 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
469 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
470 unsigned rt_count = batch->key.nr_cbufs;
471
472 struct bifrost_blend_rt *brts = rts;
473 struct midgard_blend_rt *mrts = rts;
474
475 /* Disable blending for depth-only on Bifrost */
476
477 if (rt_count == 0 && dev->quirks & IS_BIFROST)
478 brts[0].unk2 = 0x3;
479
480 for (unsigned i = 0; i < rt_count; ++i) {
481 unsigned flags = 0;
482
483 pan_pack(&flags, BLEND_FLAGS, cfg) {
484 if (blend[i].no_colour) {
485 cfg.enable = false;
486 break;
487 }
488
489 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
490
491 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
492 cfg.load_destination = blend[i].load_dest;
493 cfg.dither_disable = !batch->ctx->blend->base.dither;
494
495 if (!(dev->quirks & IS_BIFROST))
496 cfg.midgard_blend_shader = blend[i].is_shader;
497 }
498
499 if (dev->quirks & IS_BIFROST) {
500 brts[i].flags = flags;
501
502 if (blend[i].is_shader) {
503 /* The blend shader's address needs to be at
504 * the same top 32 bits as the fragment shader.
505 * TODO: Ensure that's always the case.
506 */
507 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
508 (fs->bo->gpu & (0xffffffffull << 32)));
509 brts[i].shader = blend[i].shader.gpu;
510 brts[i].unk2 = 0x0;
511 } else {
512 enum pipe_format format = batch->key.cbufs[i]->format;
513 const struct util_format_description *format_desc;
514 format_desc = util_format_description(format);
515
516 brts[i].equation = blend[i].equation.equation;
517
518 /* TODO: this is a bit more complicated */
519 brts[i].constant = blend[i].equation.constant;
520
521 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
522
523 /* 0x19 disables blending and forces REPLACE
524 * mode (equivalent to rgb_mode = alpha_mode =
525 * 0x122, colour mask = 0xF). 0x1a allows
526 * blending. */
527 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
528
529 brts[i].shader_type = fs->blend_types[i];
530 }
531 } else {
532 memcpy(&mrts[i].flags, &flags, sizeof(flags));
533
534 if (blend[i].is_shader) {
535 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
536 } else {
537 mrts[i].blend.equation = blend[i].equation.equation;
538 mrts[i].blend.constant = blend[i].equation.constant;
539 }
540 }
541 }
542 }
543
544 static void
545 panfrost_emit_frag_shader(struct panfrost_context *ctx,
546 struct mali_shader_meta *fragmeta,
547 struct panfrost_blend_final *blend)
548 {
549 const struct panfrost_device *dev = pan_device(ctx->base.screen);
550 struct panfrost_shader_state *fs;
551
552 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
553
554 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
555 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
556
557 memset(fragmeta, 0, sizeof(*fragmeta));
558
559 fragmeta->shader = fs->shader;
560 fragmeta->attribute_count = fs->attribute_count;
561 fragmeta->varying_count = fs->varying_count;
562 fragmeta->texture_count = ctx->sampler_view_count[PIPE_SHADER_FRAGMENT];
563 fragmeta->sampler_count = ctx->sampler_count[PIPE_SHADER_FRAGMENT];
564
565 if (dev->quirks & IS_BIFROST) {
566 /* First clause ATEST |= 0x4000000.
567 * Less than 32 regs |= 0x200 */
568 fragmeta->bifrost1.unk1 = 0x950020;
569
570 fragmeta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
571 fragmeta->bifrost2.preload_regs = 0x1;
572 SET_BIT(fragmeta->bifrost2.preload_regs, 0x10, fs->reads_frag_coord);
573
574 fragmeta->bifrost2.uniform_count = fs->uniform_count;
575 } else {
576 fragmeta->midgard1.uniform_count = fs->uniform_count;
577 fragmeta->midgard1.work_count = fs->work_reg_count;
578
579 /* TODO: This is not conformant on ES3 */
580 fragmeta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
581
582 fragmeta->midgard1.flags_lo = 0x20;
583 fragmeta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
584
585 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_GLOBAL, fs->writes_global);
586 }
587
588 bool msaa = rast->multisample;
589 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
590
591 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
592 fragmeta->unknown2_4 = 0x4e0;
593
594 if (dev->quirks & IS_BIFROST) {
595 /* TODO */
596 } else {
597 /* Depending on whether it's legal to do so in the given shader, we try to
598 * enable early-z testing. TODO: respect e-z force */
599
600 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
601 !fs->can_discard && !fs->writes_global &&
602 !fs->writes_depth && !fs->writes_stencil &&
603 !ctx->blend->base.alpha_to_coverage);
604
605 /* Add the writes Z/S flags if needed. */
606 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
607 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
608
609 /* Any time texturing is used, derivatives are implicitly calculated,
610 * so we need to enable helper invocations */
611
612 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
613 fs->helper_invocations);
614
615 /* If discard is enabled, which bit we set to convey this
616 * depends on if depth/stencil is used for the draw or not.
617 * Just one of depth OR stencil is enough to trigger this. */
618
619 bool zs_enabled =
620 fs->writes_depth || fs->writes_stencil ||
621 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
622 zsa->base.stencil[0].enabled;
623
624 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
625 fs->outputs_read || (!zs_enabled && fs->can_discard));
626 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, zs_enabled && fs->can_discard);
627 }
628
629 /* TODO: Sample size */
630 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
631 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
632
633 /* EXT_shader_framebuffer_fetch requires the shader to be run
634 * per-sample when outputs are read. */
635 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
636 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
637
638 fragmeta->depth_units = rast->offset_units * 2.0f;
639 fragmeta->depth_factor = rast->offset_scale;
640
641 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
642
643 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
644 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
645
646 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
647 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
648
649 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
650 zsa->base.stencil[0].enabled);
651
652 fragmeta->stencil_mask_front = zsa->stencil_mask_front;
653 fragmeta->stencil_mask_back = zsa->stencil_mask_back;
654
655 /* Bottom bits for stencil ref, exactly one word */
656 fragmeta->stencil_front.opaque[0] = zsa->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
657
658 /* If back-stencil is not enabled, use the front values */
659
660 if (zsa->base.stencil[1].enabled)
661 fragmeta->stencil_back.opaque[0] = zsa->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
662 else
663 fragmeta->stencil_back = fragmeta->stencil_front;
664
665 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
666 zsa->base.depth.writemask);
667
668 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
669 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
670 zsa->base.depth.enabled ? zsa->base.depth.func : PIPE_FUNC_ALWAYS));
671
672 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
673 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
674 !ctx->blend->base.dither);
675
676 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
677
678 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
679 ctx->blend->base.alpha_to_coverage);
680
681 /* Get blending setup */
682 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
683
684 /* Disable shader execution if we can */
685 if (dev->quirks & MIDGARD_SHADERLESS
686 && !panfrost_fs_required(fs, blend, rt_count)) {
687 fragmeta->shader = 0;
688 fragmeta->attribute_count = 0;
689 fragmeta->varying_count = 0;
690 fragmeta->texture_count = 0;
691 fragmeta->sampler_count = 0;
692
693 /* This feature is not known to work on Bifrost */
694 fragmeta->midgard1.work_count = 1;
695 fragmeta->midgard1.uniform_count = 0;
696 fragmeta->midgard1.uniform_buffer_count = 0;
697 }
698
699 /* If there is a blend shader, work registers are shared. We impose 8
700 * work registers as a limit for blend shaders. Should be lower XXX */
701
702 if (!(dev->quirks & IS_BIFROST)) {
703 for (unsigned c = 0; c < rt_count; ++c) {
704 if (blend[c].is_shader) {
705 fragmeta->midgard1.work_count =
706 MAX2(fragmeta->midgard1.work_count, 8);
707 }
708 }
709 }
710
711 if (dev->quirks & MIDGARD_SFBD) {
712 /* On single render target (SFBD) platforms, the blend
713 * information is inside the shader meta itself. We additionally
714 * need to signal CAN_DISCARD for nontrivial blend modes (so
715 * we're able to read back the destination buffer) */
716
717 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
718 blend[0].is_shader);
719
720 if (blend[0].is_shader) {
721 fragmeta->blend.shader = blend[0].shader.gpu |
722 blend[0].shader.first_tag;
723 } else {
724 fragmeta->blend.equation = blend[0].equation.equation;
725 fragmeta->blend.constant = blend[0].equation.constant;
726 }
727
728 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
729 blend[0].load_dest);
730 } else if (!(dev->quirks & IS_BIFROST)) {
731 /* Bug where MRT-capable hw apparently reads the last blend
732 * shader from here instead of the usual location? */
733
734 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
735 if (!blend[rt].is_shader)
736 continue;
737
738 fragmeta->blend.shader = blend[rt].shader.gpu |
739 blend[rt].shader.first_tag;
740 break;
741 }
742 }
743
744 if (dev->quirks & IS_BIFROST) {
745 bool no_blend = true;
746
747 for (unsigned i = 0; i < rt_count; ++i)
748 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
749
750 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
751 !fs->can_discard && !fs->writes_depth && no_blend);
752 }
753 }
754
755 void
756 panfrost_emit_shader_meta(struct panfrost_batch *batch,
757 enum pipe_shader_type st,
758 struct mali_vertex_tiler_postfix *postfix)
759 {
760 struct panfrost_context *ctx = batch->ctx;
761 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
762
763 if (!ss) {
764 postfix->shader = 0;
765 return;
766 }
767
768 struct mali_shader_meta meta;
769
770 /* Add the shader BO to the batch. */
771 panfrost_batch_add_bo(batch, ss->bo,
772 PAN_BO_ACCESS_PRIVATE |
773 PAN_BO_ACCESS_READ |
774 panfrost_bo_access_for_stage(st));
775
776 mali_ptr shader_ptr;
777
778 if (st == PIPE_SHADER_FRAGMENT) {
779 struct panfrost_device *dev = pan_device(ctx->base.screen);
780 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
781 size_t desc_size = sizeof(meta);
782 void *rts = NULL;
783 struct panfrost_transfer xfer;
784 unsigned rt_size;
785
786 if (dev->quirks & MIDGARD_SFBD)
787 rt_size = 0;
788 else if (dev->quirks & IS_BIFROST)
789 rt_size = sizeof(struct bifrost_blend_rt);
790 else
791 rt_size = sizeof(struct midgard_blend_rt);
792
793 desc_size += rt_size * rt_count;
794
795 if (rt_size)
796 rts = rzalloc_size(ctx, rt_size * rt_count);
797
798 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
799
800 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
801 blend[c] = panfrost_get_blend_for_context(ctx, c);
802
803 panfrost_emit_frag_shader(ctx, &meta, blend);
804
805 if (!(dev->quirks & MIDGARD_SFBD))
806 panfrost_emit_blend(batch, rts, blend);
807 else
808 batch->draws |= PIPE_CLEAR_COLOR0;
809
810 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
811
812 memcpy(xfer.cpu, &meta, sizeof(meta));
813 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
814
815 if (rt_size)
816 ralloc_free(rts);
817
818 shader_ptr = xfer.gpu;
819 } else {
820 panfrost_emit_compute_shader(ctx, st, &meta);
821
822 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
823 sizeof(meta));
824 }
825
826 postfix->shader = shader_ptr;
827 }
828
829 void
830 panfrost_emit_viewport(struct panfrost_batch *batch,
831 struct mali_vertex_tiler_postfix *tiler_postfix)
832 {
833 struct panfrost_context *ctx = batch->ctx;
834 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
835 const struct pipe_scissor_state *ss = &ctx->scissor;
836 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
837 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
838
839 /* Derive min/max from translate/scale. Note since |x| >= 0 by
840 * definition, we have that -|x| <= |x| hence translate - |scale| <=
841 * translate + |scale|, so the ordering is correct here. */
842 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
843 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
844 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
845 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
846 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
847 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
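/* As a concrete case, a full 800x600 viewport has translate = (400, 300)
 * and scale = (400, 300), giving vp_minx = vp_miny = 0 and
 * (vp_maxx, vp_maxy) = (800, 600) before the clamping below. */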
848
849 /* Scissor to the intersection of viewport and to the scissor, clamped
850 * to the framebuffer */
851
852 unsigned minx = MIN2(fb->width, vp_minx);
853 unsigned maxx = MIN2(fb->width, vp_maxx);
854 unsigned miny = MIN2(fb->height, vp_miny);
855 unsigned maxy = MIN2(fb->height, vp_maxy);
856
857 if (ss && rast->scissor) {
858 minx = MAX2(ss->minx, minx);
859 miny = MAX2(ss->miny, miny);
860 maxx = MIN2(ss->maxx, maxx);
861 maxy = MIN2(ss->maxy, maxy);
862 }
863
864 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
865
866 pan_pack(T.cpu, VIEWPORT, cfg) {
867 cfg.scissor_minimum_x = minx;
868 cfg.scissor_minimum_y = miny;
869 cfg.scissor_maximum_x = maxx - 1;
870 cfg.scissor_maximum_y = maxy - 1;
871
872 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
873 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
874 }
875
876 tiler_postfix->viewport = T.gpu;
877 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
878 }
879
880 static mali_ptr
881 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
882 enum pipe_shader_type st,
883 struct panfrost_constant_buffer *buf,
884 unsigned index)
885 {
886 struct pipe_constant_buffer *cb = &buf->cb[index];
887 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
888
889 if (rsrc) {
890 panfrost_batch_add_bo(batch, rsrc->bo,
891 PAN_BO_ACCESS_SHARED |
892 PAN_BO_ACCESS_READ |
893 panfrost_bo_access_for_stage(st));
894
895 /* Alignment guaranteed by
896 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
897 return rsrc->bo->gpu + cb->buffer_offset;
898 } else if (cb->user_buffer) {
899 return panfrost_pool_upload_aligned(&batch->pool,
900 cb->user_buffer +
901 cb->buffer_offset,
902 cb->buffer_size, 16);
903 } else {
904 unreachable("No constant buffer");
905 }
906 }
907
908 struct sysval_uniform {
909 union {
910 float f[4];
911 int32_t i[4];
912 uint32_t u[4];
913 uint64_t du[2];
914 };
915 };
916
917 static void
918 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
919 struct sysval_uniform *uniform)
920 {
921 struct panfrost_context *ctx = batch->ctx;
922 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
923
924 uniform->f[0] = vp->scale[0];
925 uniform->f[1] = vp->scale[1];
926 uniform->f[2] = vp->scale[2];
927 }
928
929 static void
930 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
931 struct sysval_uniform *uniform)
932 {
933 struct panfrost_context *ctx = batch->ctx;
934 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
935
936 uniform->f[0] = vp->translate[0];
937 uniform->f[1] = vp->translate[1];
938 uniform->f[2] = vp->translate[2];
939 }
940
941 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
942 enum pipe_shader_type st,
943 unsigned int sysvalid,
944 struct sysval_uniform *uniform)
945 {
946 struct panfrost_context *ctx = batch->ctx;
947 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
948 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
949 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
950 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
951
952 assert(dim);
953 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
954
955 if (dim > 1)
956 uniform->i[1] = u_minify(tex->texture->height0,
957 tex->u.tex.first_level);
958
959 if (dim > 2)
960 uniform->i[2] = u_minify(tex->texture->depth0,
961 tex->u.tex.first_level);
962
963 if (is_array)
964 uniform->i[dim] = tex->texture->array_size;
965 }
966
967 static void
968 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
969 enum pipe_shader_type st,
970 unsigned ssbo_id,
971 struct sysval_uniform *uniform)
972 {
973 struct panfrost_context *ctx = batch->ctx;
974
975 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
976 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
977
978 /* Compute address */
979 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
980
981 panfrost_batch_add_bo(batch, bo,
982 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
983 panfrost_bo_access_for_stage(st));
984
985 /* Upload address and size as sysval */
986 uniform->du[0] = bo->gpu + sb.buffer_offset;
987 uniform->u[2] = sb.buffer_size;
988 }
989
990 static void
991 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
992 enum pipe_shader_type st,
993 unsigned samp_idx,
994 struct sysval_uniform *uniform)
995 {
996 struct panfrost_context *ctx = batch->ctx;
997 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
998
999 uniform->f[0] = sampl->min_lod;
1000 uniform->f[1] = sampl->max_lod;
1001 uniform->f[2] = sampl->lod_bias;
1002
1003 /* Even without any errata, Midgard represents "no mipmapping" as
1004 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1005 * panfrost_create_sampler_state which also explains our choice of
1006 * epsilon value (again to keep behaviour consistent) */
1007
1008 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1009 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1010 }
1011
1012 static void
1013 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1014 struct sysval_uniform *uniform)
1015 {
1016 struct panfrost_context *ctx = batch->ctx;
1017
1018 uniform->u[0] = ctx->compute_grid->grid[0];
1019 uniform->u[1] = ctx->compute_grid->grid[1];
1020 uniform->u[2] = ctx->compute_grid->grid[2];
1021 }
1022
1023 static void
1024 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1025 struct panfrost_shader_state *ss,
1026 enum pipe_shader_type st)
1027 {
1028 struct sysval_uniform *uniforms = (void *)buf;
1029
1030 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1031 int sysval = ss->sysval[i];
1032
1033 switch (PAN_SYSVAL_TYPE(sysval)) {
1034 case PAN_SYSVAL_VIEWPORT_SCALE:
1035 panfrost_upload_viewport_scale_sysval(batch,
1036 &uniforms[i]);
1037 break;
1038 case PAN_SYSVAL_VIEWPORT_OFFSET:
1039 panfrost_upload_viewport_offset_sysval(batch,
1040 &uniforms[i]);
1041 break;
1042 case PAN_SYSVAL_TEXTURE_SIZE:
1043 panfrost_upload_txs_sysval(batch, st,
1044 PAN_SYSVAL_ID(sysval),
1045 &uniforms[i]);
1046 break;
1047 case PAN_SYSVAL_SSBO:
1048 panfrost_upload_ssbo_sysval(batch, st,
1049 PAN_SYSVAL_ID(sysval),
1050 &uniforms[i]);
1051 break;
1052 case PAN_SYSVAL_NUM_WORK_GROUPS:
1053 panfrost_upload_num_work_groups_sysval(batch,
1054 &uniforms[i]);
1055 break;
1056 case PAN_SYSVAL_SAMPLER:
1057 panfrost_upload_sampler_sysval(batch, st,
1058 PAN_SYSVAL_ID(sysval),
1059 &uniforms[i]);
1060 break;
1061 default:
1062 assert(0);
1063 }
1064 }
1065 }
1066
1067 static const void *
1068 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1069 unsigned index)
1070 {
1071 struct pipe_constant_buffer *cb = &buf->cb[index];
1072 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1073
1074 if (rsrc)
1075 return rsrc->bo->cpu;
1076 else if (cb->user_buffer)
1077 return cb->user_buffer;
1078 else
1079 unreachable("No constant buffer");
1080 }
1081
1082 void
1083 panfrost_emit_const_buf(struct panfrost_batch *batch,
1084 enum pipe_shader_type stage,
1085 struct mali_vertex_tiler_postfix *postfix)
1086 {
1087 struct panfrost_context *ctx = batch->ctx;
1088 struct panfrost_shader_variants *all = ctx->shader[stage];
1089
1090 if (!all)
1091 return;
1092
1093 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1094
1095 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1096
1097 /* Uniforms are implicitly UBO #0 */
1098 bool has_uniforms = buf->enabled_mask & (1 << 0);
1099
1100 /* Allocate room for the sysval and the uniforms */
1101 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1102 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1103 size_t size = sys_size + uniform_size;
1104 struct panfrost_transfer transfer =
1105 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1106
1107 /* Upload sysvals requested by the shader */
1108 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1109
1110 /* Upload uniforms */
1111 if (has_uniforms && uniform_size) {
1112 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1113 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1114 }
1115
1116 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1117 * uploaded */
1118
1119 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1120 assert(ubo_count >= 1);
1121
1122 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1123 struct panfrost_transfer ubos =
1124 panfrost_pool_alloc_aligned(&batch->pool, sz,
1125 MALI_UNIFORM_BUFFER_LENGTH);
1126
1127 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1128
1129 /* Upload uniforms as a UBO */
1130
1131 if (size) {
1132 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1133 cfg.entries = DIV_ROUND_UP(size, 16);
1134 cfg.pointer = transfer.gpu;
1135 }
1136 } else {
1137 *ubo_ptr = 0;
1138 }
1139
1140 /* The rest are honest-to-goodness UBOs */
1141
1142 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1143 size_t usz = buf->cb[ubo].buffer_size;
1144 bool enabled = buf->enabled_mask & (1 << ubo);
1145 bool empty = usz == 0;
1146
1147 if (!enabled || empty) {
1148 ubo_ptr[ubo] = 0;
1149 continue;
1150 }
1151
1152 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1153 cfg.entries = DIV_ROUND_UP(usz, 16);
1154 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1155 stage, buf, ubo);
1156 }
1157 }
1158
1159 postfix->uniforms = transfer.gpu;
1160 postfix->uniform_buffers = ubos.gpu;
1161
1162 buf->dirty_mask = 0;
1163 }
1164
1165 void
1166 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1167 const struct pipe_grid_info *info,
1168 struct midgard_payload_vertex_tiler *vtp)
1169 {
1170 struct panfrost_context *ctx = batch->ctx;
1171 struct panfrost_device *dev = pan_device(ctx->base.screen);
1172 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1173 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1174 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1175 128));
1176
1177 unsigned log2_instances =
1178 util_logbase2_ceil(info->grid[0]) +
1179 util_logbase2_ceil(info->grid[1]) +
1180 util_logbase2_ceil(info->grid[2]);
1181
1182 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
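/* Sizing sketch (example values): ss->shared_size = 1024 and a
 * (10, 10, 1) grid give log2_instances = 4 + 4 + 0 = 8, so on a
 * hypothetical 8-core device shared_size = 1024 * 256 * 8 = 2 MiB. */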
1183 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1184 shared_size,
1185 1);
1186
1187 struct mali_shared_memory shared = {
1188 .shared_memory = bo->gpu,
1189 .shared_workgroup_count = log2_instances,
1190 .shared_shift = util_logbase2(single_size) + 1
1191 };
1192
1193 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1194 sizeof(shared), 64);
1195 }
1196
1197 static mali_ptr
1198 panfrost_get_tex_desc(struct panfrost_batch *batch,
1199 enum pipe_shader_type st,
1200 struct panfrost_sampler_view *view)
1201 {
1202 if (!view)
1203 return (mali_ptr) 0;
1204
1205 struct pipe_sampler_view *pview = &view->base;
1206 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1207
1208 /* Add the BO to the job so it's retained until the job is done. */
1209
1210 panfrost_batch_add_bo(batch, rsrc->bo,
1211 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1212 panfrost_bo_access_for_stage(st));
1213
1214 panfrost_batch_add_bo(batch, view->bo,
1215 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1216 panfrost_bo_access_for_stage(st));
1217
1218 return view->bo->gpu;
1219 }
1220
1221 static void
1222 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1223 struct pipe_context *pctx)
1224 {
1225 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1226 if (view->texture_bo != rsrc->bo->gpu ||
1227 view->modifier != rsrc->modifier) {
1228 panfrost_bo_unreference(view->bo);
1229 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1230 }
1231 }
1232
1233 void
1234 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1235 enum pipe_shader_type stage,
1236 struct mali_vertex_tiler_postfix *postfix)
1237 {
1238 struct panfrost_context *ctx = batch->ctx;
1239 struct panfrost_device *device = pan_device(ctx->base.screen);
1240
1241 if (!ctx->sampler_view_count[stage])
1242 return;
1243
1244 if (device->quirks & IS_BIFROST) {
1245 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1246 MALI_BIFROST_TEXTURE_LENGTH *
1247 ctx->sampler_view_count[stage],
1248 MALI_BIFROST_TEXTURE_LENGTH);
1249
1250 struct mali_bifrost_texture_packed *out =
1251 (struct mali_bifrost_texture_packed *) T.cpu;
1252
1253 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1254 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1255 struct pipe_sampler_view *pview = &view->base;
1256 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1257
1258 panfrost_update_sampler_view(view, &ctx->base);
1259 out[i] = view->bifrost_descriptor;
1260
1261 /* Add the BOs to the job so they are retained until the job is done. */
1262
1263 panfrost_batch_add_bo(batch, rsrc->bo,
1264 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1265 panfrost_bo_access_for_stage(stage));
1266
1267 panfrost_batch_add_bo(batch, view->bo,
1268 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1269 panfrost_bo_access_for_stage(stage));
1270 }
1271
1272 postfix->textures = T.gpu;
1273 } else {
1274 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1275
1276 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1277 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1278
1279 panfrost_update_sampler_view(view, &ctx->base);
1280
1281 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1282 }
1283
1284 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1285 trampolines,
1286 sizeof(uint64_t) *
1287 ctx->sampler_view_count[stage],
1288 sizeof(uint64_t));
1289 }
1290 }
1291
1292 void
1293 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1294 enum pipe_shader_type stage,
1295 struct mali_vertex_tiler_postfix *postfix)
1296 {
1297 struct panfrost_context *ctx = batch->ctx;
1298
1299 if (!ctx->sampler_count[stage])
1300 return;
1301
1302 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1303 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1304
1305 size_t sz = desc_size * ctx->sampler_count[stage];
1306 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1307 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1308
1309 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1310 out[i] = ctx->samplers[stage][i]->hw;
1311
1312 postfix->sampler_descriptor = T.gpu;
1313 }
1314
1315 void
1316 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1317 struct mali_vertex_tiler_postfix *vertex_postfix)
1318 {
1319 struct panfrost_context *ctx = batch->ctx;
1320 struct panfrost_vertex_state *so = ctx->vertex;
1321 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1322
1323 unsigned instance_shift = vertex_postfix->instance_shift;
1324 unsigned instance_odd = vertex_postfix->instance_odd;
1325
1326 /* Worst case: everything is NPOT, which is only possible if instancing
1327 * is enabled. Otherwise a single record is guaranteed */
1328 bool could_npot = instance_shift || instance_odd;
1329
1330 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1331 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1332 (could_npot ? 2 : 1),
1333 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1334
1335 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1336 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1337 MALI_ATTRIBUTE_LENGTH);
1338
1339 struct mali_attribute_buffer_packed *bufs =
1340 (struct mali_attribute_buffer_packed *) S.cpu;
1341
1342 struct mali_attribute_packed *out =
1343 (struct mali_attribute_packed *) T.cpu;
1344
1345 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1346 unsigned k = 0;
1347
1348 for (unsigned i = 0; i < so->num_elements; ++i) {
1349 /* We map buffers 1:1 with the attributes, which
1350 * means duplicating some vertex buffers (who cares? aside from
1351 * maybe some caching implications but I somehow doubt that
1352 * matters) */
1353
1354 struct pipe_vertex_element *elem = &so->pipe[i];
1355 unsigned vbi = elem->vertex_buffer_index;
1356 attrib_to_buffer[i] = k;
1357
1358 if (!(ctx->vb_mask & (1 << vbi)))
1359 continue;
1360
1361 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1362 struct panfrost_resource *rsrc;
1363
1364 rsrc = pan_resource(buf->buffer.resource);
1365 if (!rsrc)
1366 continue;
1367
1368 /* Add a dependency of the batch on the vertex buffer */
1369 panfrost_batch_add_bo(batch, rsrc->bo,
1370 PAN_BO_ACCESS_SHARED |
1371 PAN_BO_ACCESS_READ |
1372 PAN_BO_ACCESS_VERTEX_TILER);
1373
1374 /* Mask off lower bits, see offset fixup below */
1375 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1376 mali_ptr addr = raw_addr & ~63;
1377
1378 /* Since we advanced the base pointer, we shrink the buffer
1379 * size, but add the offset we subtracted */
1380 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1381 - buf->buffer_offset;
1382
1383 /* When there is a divisor, the hardware-level divisor is
1384 * the product of the instance divisor and the padded count */
1385 unsigned divisor = elem->instance_divisor;
1386 unsigned hw_divisor = ctx->padded_count * divisor;
1387 unsigned stride = buf->stride;
1388
1389 /* If there's a divisor (=1) but no instancing, we want every
1390 * attribute to be the same */
1391
1392 if (divisor && ctx->instance_count == 1)
1393 stride = 0;
1394
1395 if (!divisor || ctx->instance_count <= 1) {
1396 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1397 if (ctx->instance_count > 1)
1398 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1399
1400 cfg.pointer = addr;
1401 cfg.stride = stride;
1402 cfg.size = size;
1403 cfg.divisor_r = instance_shift;
1404 cfg.divisor_p = instance_odd;
1405 }
1406 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1407 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1408 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1409 cfg.pointer = addr;
1410 cfg.stride = stride;
1411 cfg.size = size;
1412 cfg.divisor_r = __builtin_ctz(hw_divisor);
1413 }
1414
1415 } else {
1416 unsigned shift = 0, extra_flags = 0;
1417
1418 unsigned magic_divisor =
1419 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1420
1421 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1422 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1423 cfg.pointer = addr;
1424 cfg.stride = stride;
1425 cfg.size = size;
1426
1427 cfg.divisor_r = shift;
1428 cfg.divisor_e = extra_flags;
1429 }
1430
1431 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1432 cfg.divisor_numerator = magic_divisor;
1433 cfg.divisor = divisor;
1434 }
1435
1436 ++k;
1437 }
1438
1439 ++k;
1440 }
1441
1442 /* Add special gl_VertexID/gl_InstanceID buffers */
1443
1444 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1445 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1446
1447 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1448 cfg.buffer_index = k++;
1449 cfg.format = so->formats[PAN_VERTEX_ID];
1450 }
1451
1452 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1453
1454 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1455 cfg.buffer_index = k++;
1456 cfg.format = so->formats[PAN_INSTANCE_ID];
1457 }
1458 }
1459
1460 /* Attribute addresses require 64-byte alignment, so let:
1461 *
1462 * base' = base & ~63 = base - (base & 63)
1463 * offset' = offset + (base & 63)
1464 *
1465 * Since base' + offset' = base + offset, these are equivalent
1466 * addressing modes and now base is 64 aligned.
1467 */
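/* Concretely: base = 0x10000043, offset = 8 becomes base' = 0x10000040
 * and offset' = 8 + 3 = 11; the sum is unchanged and base' is now
 * 64-byte aligned. */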
1468
1469 unsigned start = vertex_postfix->offset_start;
1470
1471 for (unsigned i = 0; i < so->num_elements; ++i) {
1472 unsigned vbi = so->pipe[i].vertex_buffer_index;
1473 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1474
1475 /* Adjust by the masked off bits of the offset. Make sure we
1476 * read src_offset from so->hw (which is not GPU visible)
1477 * rather than target (which is) due to caching effects */
1478
1479 unsigned src_offset = so->pipe[i].src_offset;
1480
1481 /* BOs aligned to 4k so guaranteed aligned to 64 */
1482 src_offset += (buf->buffer_offset & 63);
1483
1484 /* Also, somewhat obscurely, per-instance data needs to be
1485 * offset in response to a delayed start in an indexed draw */
1486
1487 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1488 src_offset -= buf->stride * start;
1489
1490 pan_pack(out + i, ATTRIBUTE, cfg) {
1491 cfg.buffer_index = attrib_to_buffer[i];
1492 cfg.format = so->formats[i];
1493 cfg.offset = src_offset;
1494 }
1495 }
1496
1497 vertex_postfix->attributes = S.gpu;
1498 vertex_postfix->attribute_meta = T.gpu;
1499 }
1500
1501 static mali_ptr
1502 panfrost_emit_varyings(struct panfrost_batch *batch,
1503 struct mali_attribute_buffer_packed *slot,
1504 unsigned stride, unsigned count)
1505 {
1506 unsigned size = stride * count;
1507 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1508
1509 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1510 cfg.stride = stride;
1511 cfg.size = size;
1512 cfg.pointer = ptr;
1513 }
1514
1515 return ptr;
1516 }
1517
1518 static unsigned
1519 panfrost_streamout_offset(unsigned stride, unsigned offset,
1520 struct pipe_stream_output_target *target)
1521 {
1522 return (target->buffer_offset + (offset * stride * 4)) & 63;
1523 }
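/* Example: buffer_offset = 0x104, offset = 3, stride = 4 words gives
 * (0x104 + 48) & 63 = 52, the sub-64-byte remainder that ends up in the
 * varying record's offset while the buffer pointer itself stays aligned. */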
1524
1525 static void
1526 panfrost_emit_streamout(struct panfrost_batch *batch,
1527 struct mali_attribute_buffer_packed *slot,
1528 unsigned stride_words, unsigned offset, unsigned count,
1529 struct pipe_stream_output_target *target)
1530 {
1531 unsigned stride = stride_words * 4;
1532 unsigned max_size = target->buffer_size;
1533 unsigned expected_size = stride * count;
1534
1535 /* Grab the BO and bind it to the batch */
1536 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1537
1538 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1539 * the perspective of the TILER and FRAGMENT.
1540 */
1541 panfrost_batch_add_bo(batch, bo,
1542 PAN_BO_ACCESS_SHARED |
1543 PAN_BO_ACCESS_RW |
1544 PAN_BO_ACCESS_VERTEX_TILER |
1545 PAN_BO_ACCESS_FRAGMENT);
1546
1547 /* We will have an offset applied to get alignment */
1548 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1549
1550 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1551 cfg.pointer = (addr & ~63);
1552 cfg.stride = stride;
1553 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1554 }
1555 }
1556
1557 static bool
1558 has_point_coord(unsigned mask, gl_varying_slot loc)
1559 {
1560 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1561 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1562 else if (loc == VARYING_SLOT_PNTC)
1563 return (mask & (1 << 8));
1564 else
1565 return false;
1566 }
1567
1568 /* Helpers for manipulating stream out information so we can pack varyings
1569 * accordingly. Compute the src_offset for a given captured varying */
1570
1571 static struct pipe_stream_output *
1572 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1573 {
1574 for (unsigned i = 0; i < info->num_outputs; ++i) {
1575 if (info->output[i].register_index == loc)
1576 return &info->output[i];
1577 }
1578
1579 unreachable("Varying not captured");
1580 }
1581
1582 static unsigned
1583 pan_varying_size(enum mali_format fmt)
1584 {
1585 unsigned type = MALI_EXTRACT_TYPE(fmt);
1586 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1587 unsigned bits = MALI_EXTRACT_BITS(fmt);
1588 unsigned bpc = 0;
1589
1590 if (bits == MALI_CHANNEL_FLOAT) {
1591 /* No doubles */
1592 bool fp16 = (type == MALI_FORMAT_SINT);
1593 assert(fp16 || (type == MALI_FORMAT_UNORM));
1594
1595 bpc = fp16 ? 2 : 4;
1596 } else {
1597 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1598
1599 /* See the enums */
1600 bits = 1 << bits;
1601 assert(bits >= 8);
1602 bpc = bits / 8;
1603 }
1604
1605 return bpc * chan;
1606 }
1607
1608 /* Indices for named (non-XFB) varyings that are present. These are packed
1609 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1610 * PAN_VARY_*). This has the nice property that you can look up the buffer index
1611 * of a given special field given a shift S by:
1612 *
1613 * idx = popcount(P & ((1 << S) - 1))
1614 *
1615 * That is... look at all of the varyings that come earlier and count them;
1616 * that count is this varying's buffer index. Likewise, the total number of
1617 * special buffers required is simply popcount(P)
1618 */
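/* For instance, if only GENERAL, POSITION and PNTCOORD are present then
 * P = 0b1011, the PNTCOORD buffer index is
 * popcount(P & ((1 << PAN_VARY_PNTCOORD) - 1)) = popcount(0b011) = 2,
 * and pan_xfb_base(P) = 3. */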
1619
1620 enum pan_special_varying {
1621 PAN_VARY_GENERAL = 0,
1622 PAN_VARY_POSITION = 1,
1623 PAN_VARY_PSIZ = 2,
1624 PAN_VARY_PNTCOORD = 3,
1625 PAN_VARY_FACE = 4,
1626 PAN_VARY_FRAGCOORD = 5,
1627
1628 /* Keep last */
1629 PAN_VARY_MAX,
1630 };
1631
1632 /* Given a varying, figure out which index it corresponds to */
1633
1634 static inline unsigned
1635 pan_varying_index(unsigned present, enum pan_special_varying v)
1636 {
1637 unsigned mask = (1 << v) - 1;
1638 return util_bitcount(present & mask);
1639 }
1640
1641 /* Get the base offset for XFB buffers, which by convention come after
1642 * everything else. Wrapper function for semantic reasons; by construction this
1643 * is just popcount. */
1644
1645 static inline unsigned
1646 pan_xfb_base(unsigned present)
1647 {
1648 return util_bitcount(present);
1649 }
1650
1651 /* Computes the present mask for varyings so we can start emitting varying records */
1652
1653 static inline unsigned
1654 pan_varying_present(
1655 struct panfrost_shader_state *vs,
1656 struct panfrost_shader_state *fs,
1657 unsigned quirks)
1658 {
1659 /* At the moment we always emit general and position buffers. Not
1660 * strictly necessary but usually harmless */
1661
1662 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1663
1664 /* Enable special buffers by the shader info */
1665
1666 if (vs->writes_point_size)
1667 present |= (1 << PAN_VARY_PSIZ);
1668
1669 if (fs->reads_point_coord)
1670 present |= (1 << PAN_VARY_PNTCOORD);
1671
1672 if (fs->reads_face)
1673 present |= (1 << PAN_VARY_FACE);
1674
1675 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1676 present |= (1 << PAN_VARY_FRAGCOORD);
1677
1678 /* Also, if we have a point sprite, we need a point coord buffer */
1679
1680 for (unsigned i = 0; i < fs->varying_count; i++) {
1681 gl_varying_slot loc = fs->varyings_loc[i];
1682
1683 if (has_point_coord(fs->point_sprite_mask, loc))
1684 present |= (1 << PAN_VARY_PNTCOORD);
1685 }
1686
1687 return present;
1688 }
1689
1690 /* Emitters for varying records */
1691
1692 static void
1693 pan_emit_vary(struct mali_attribute_packed *out,
1694 unsigned present, enum pan_special_varying buf,
1695 unsigned quirks, enum mali_format format,
1696 unsigned offset)
1697 {
1698 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1699 unsigned swizzle = quirks & HAS_SWIZZLES ?
1700 panfrost_get_default_swizzle(nr_channels) :
1701 panfrost_bifrost_swizzle(nr_channels);
1702
1703 pan_pack(out, ATTRIBUTE, cfg) {
1704 cfg.buffer_index = pan_varying_index(present, buf);
1705 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1706 cfg.format = (format << 12) | swizzle;
1707 cfg.offset = offset;
1708 }
1709 }
1710
1711 /* General varying that is unused */
1712
1713 static void
1714 pan_emit_vary_only(struct mali_attribute_packed *out,
1715 unsigned present, unsigned quirks)
1716 {
1717 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1718 }
1719
1720 /* Special records */
1721
1722 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1723 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1724 [PAN_VARY_PSIZ] = MALI_R16F,
1725 [PAN_VARY_PNTCOORD] = MALI_R16F,
1726 [PAN_VARY_FACE] = MALI_R32I,
1727 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1728 };
1729
1730 static void
1731 pan_emit_vary_special(struct mali_attribute_packed *out,
1732 unsigned present, enum pan_special_varying buf,
1733 unsigned quirks)
1734 {
1735 assert(buf < PAN_VARY_MAX);
1736 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1737 }
1738
1739 static enum mali_format
1740 pan_xfb_format(enum mali_format format, unsigned nr)
1741 {
1742 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1743 return MALI_R32F | MALI_NR_CHANNELS(nr);
1744 else
1745 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1746 }
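
/* For instance, a float varying captured with three components is emitted as
 * MALI_R32F | MALI_NR_CHANNELS(3), while non-float types keep their base type
 * but are widened to 32 bits per channel. (Illustrative restatement of the
 * helper above, not an exhaustive format table.) */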
1747
1748 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1749 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1750 * value. */
1751
1752 static void
1753 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1754 unsigned present,
1755 unsigned max_xfb,
1756 unsigned *streamout_offsets,
1757 unsigned quirks,
1758 enum mali_format format,
1759 struct pipe_stream_output o)
1760 {
1761 unsigned swizzle = quirks & HAS_SWIZZLES ?
1762 panfrost_get_default_swizzle(o.num_components) :
1763 panfrost_bifrost_swizzle(o.num_components);
1764
1765 pan_pack(out, ATTRIBUTE, cfg) {
1766 /* XFB buffers come after everything else */
1767 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1768 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1769
1770 /* Override number of channels and precision to highp */
1771 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1772
1773                 /* Combine the output's destination offset (dwords, converted to bytes) with the streamout buffer offset */
1774 cfg.offset = (o.dst_offset * 4) /* dwords */
1775 + streamout_offsets[o.output_buffer];
1776 }
1777 }
1778
1779 /* Determine if we should capture a varying for XFB. This requires actually
1780  * having a buffer for it. If we don't capture it, we fall back to a general
1781 * varying path (linked or unlinked, possibly discarding the write) */
1782
1783 static bool
1784 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1785 unsigned loc, unsigned max_xfb)
1786 {
1787 if (!(xfb->so_mask & (1ll << loc)))
1788 return false;
1789
1790 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1791 return o->output_buffer < max_xfb;
1792 }
1793
1794 static void
1795 pan_emit_general_varying(struct mali_attribute_packed *out,
1796 struct panfrost_shader_state *other,
1797 struct panfrost_shader_state *xfb,
1798 gl_varying_slot loc,
1799 enum mali_format format,
1800 unsigned present,
1801 unsigned quirks,
1802 unsigned *gen_offsets,
1803 enum mali_format *gen_formats,
1804 unsigned *gen_stride,
1805 unsigned idx,
1806 bool should_alloc)
1807 {
1808 /* Check if we're linked */
1809 signed other_idx = -1;
1810
1811 for (unsigned j = 0; j < other->varying_count; ++j) {
1812 if (other->varyings_loc[j] == loc) {
1813 other_idx = j;
1814 break;
1815 }
1816 }
1817
1818 if (other_idx < 0) {
1819 pan_emit_vary_only(out, present, quirks);
1820 return;
1821 }
1822
1823 unsigned offset = gen_offsets[other_idx];
1824
1825 if (should_alloc) {
1826                 /* We're linked, so allocate space via a watermark allocation */
1827 enum mali_format alt = other->varyings[other_idx];
1828
1829 /* Do interpolation at minimum precision */
1830 unsigned size_main = pan_varying_size(format);
1831 unsigned size_alt = pan_varying_size(alt);
1832 unsigned size = MIN2(size_main, size_alt);
1833
1834 /* If a varying is marked for XFB but not actually captured, we
1835 * should match the format to the format that would otherwise
1836 * be used for XFB, since dEQP checks for invariance here. It's
1837 * unclear if this is required by the spec. */
1838
1839 if (xfb->so_mask & (1ull << loc)) {
1840 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1841 format = pan_xfb_format(format, o->num_components);
1842 size = pan_varying_size(format);
1843 } else if (size == size_alt) {
1844 format = alt;
1845 }
1846
1847 gen_offsets[idx] = *gen_stride;
1848 gen_formats[other_idx] = format;
1849 offset = *gen_stride;
1850 *gen_stride += size;
1851 }
1852
1853 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1854 }
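
/* A rough worked example of the watermark allocation above, assuming
 * pan_varying_size returns the byte size of a format: a linked fp32 vec4
 * followed by a linked fp16 vec2 would be packed as
 *
 *      offset 0,  size 16 bytes  (fp32 vec4)
 *      offset 16, size 4 bytes   (fp16 vec2)
 *
 * leaving gen_stride = 20 bytes per vertex for the general buffer. */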
1855
1856 /* Higher-level wrapper around the emitters above, classifying a varying as a
1857  * special record, an XFB capture, or a general varying */
1858
1859 static void
1860 panfrost_emit_varying(
1861 struct mali_attribute_packed *out,
1862 struct panfrost_shader_state *stage,
1863 struct panfrost_shader_state *other,
1864 struct panfrost_shader_state *xfb,
1865 unsigned present,
1866 unsigned max_xfb,
1867 unsigned *streamout_offsets,
1868 unsigned quirks,
1869 unsigned *gen_offsets,
1870 enum mali_format *gen_formats,
1871 unsigned *gen_stride,
1872 unsigned idx,
1873 bool should_alloc,
1874 bool is_fragment)
1875 {
1876 gl_varying_slot loc = stage->varyings_loc[idx];
1877 enum mali_format format = stage->varyings[idx];
1878
1879 /* Override format to match linkage */
1880 if (!should_alloc && gen_formats[idx])
1881 format = gen_formats[idx];
1882
1883 if (has_point_coord(stage->point_sprite_mask, loc)) {
1884 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1885 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1886 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1887 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1888 } else if (loc == VARYING_SLOT_POS) {
1889 if (is_fragment)
1890 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1891 else
1892 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1893 } else if (loc == VARYING_SLOT_PSIZ) {
1894 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1895 } else if (loc == VARYING_SLOT_PNTC) {
1896 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1897 } else if (loc == VARYING_SLOT_FACE) {
1898 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1899 } else {
1900 pan_emit_general_varying(out, other, xfb, loc, format, present,
1901 quirks, gen_offsets, gen_formats, gen_stride,
1902 idx, should_alloc);
1903 }
1904 }
1905
1906 static void
1907 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1908 unsigned present,
1909 enum pan_special_varying v,
1910 unsigned special)
1911 {
1912 if (present & (1 << v)) {
1913 unsigned idx = pan_varying_index(present, v);
1914
1915 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1916 cfg.special = special;
1917 cfg.type = 0;
1918 }
1919 }
1920 }
1921
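/* Rough sketch of the uploads done by the descriptor emission below; an
 * illustrative summary, not authoritative documentation of the layout:
 *
 *   trans: [ vs->varying_count ATTRIBUTE records | fs->varying_count ATTRIBUTE records ]
 *   T:     [ popcount(present) special/general ATTRIBUTE_BUFFERs | one per streamout target ]
 *
 * vertex_postfix->varying_meta points at the VS records and
 * tiler_postfix->varying_meta at the FS records (trans.gpu + vs_size), while
 * both postfixes share T.gpu for the buffer array. */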
1922 void
1923 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1924 unsigned vertex_count,
1925 struct mali_vertex_tiler_postfix *vertex_postfix,
1926 struct mali_vertex_tiler_postfix *tiler_postfix,
1927 union midgard_primitive_size *primitive_size)
1928 {
1929 /* Load the shaders */
1930 struct panfrost_context *ctx = batch->ctx;
1931 struct panfrost_device *dev = pan_device(ctx->base.screen);
1932 struct panfrost_shader_state *vs, *fs;
1933 size_t vs_size, fs_size;
1934
1935 /* Allocate the varying descriptor */
1936
1937 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1938 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1939 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1940 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1941
1942 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1943 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1944
1945 struct pipe_stream_output_info *so = &vs->stream_output;
1946 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1947
1948 /* Check if this varying is linked by us. This is the case for
1949 * general-purpose, non-captured varyings. If it is, link it. If it's
1950 * not, use the provided stream out information to determine the
1951 * offset, since it was already linked for us. */
1952
1953 unsigned gen_offsets[32];
1954 enum mali_format gen_formats[32];
1955 memset(gen_offsets, 0, sizeof(gen_offsets));
1956 memset(gen_formats, 0, sizeof(gen_formats));
1957
1958 unsigned gen_stride = 0;
1959 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1960 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1961
1962 unsigned streamout_offsets[32];
1963
1964 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1965 streamout_offsets[i] = panfrost_streamout_offset(
1966 so->stride[i],
1967 ctx->streamout.offsets[i],
1968 ctx->streamout.targets[i]);
1969 }
1970
1971 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1972 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1973
1974 for (unsigned i = 0; i < vs->varying_count; i++) {
1975 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1976 ctx->streamout.num_targets, streamout_offsets,
1977 dev->quirks,
1978 gen_offsets, gen_formats, &gen_stride, i, true, false);
1979 }
1980
1981 for (unsigned i = 0; i < fs->varying_count; i++) {
1982 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1983 ctx->streamout.num_targets, streamout_offsets,
1984 dev->quirks,
1985 gen_offsets, gen_formats, &gen_stride, i, false, true);
1986 }
1987
1988 unsigned xfb_base = pan_xfb_base(present);
1989 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1990 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1991 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1992 struct mali_attribute_buffer_packed *varyings =
1993 (struct mali_attribute_buffer_packed *) T.cpu;
1994
1995 /* Emit the stream out buffers */
1996
1997 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1998 ctx->vertex_count);
1999
2000 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2001 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2002 so->stride[i],
2003 ctx->streamout.offsets[i],
2004 out_count,
2005 ctx->streamout.targets[i]);
2006 }
2007
2008 panfrost_emit_varyings(batch,
2009 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2010 gen_stride, vertex_count);
2011
2012 /* fp32 vec4 gl_Position */
2013 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2014 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2015 sizeof(float) * 4, vertex_count);
2016
2017 if (present & (1 << PAN_VARY_PSIZ)) {
2018 primitive_size->pointer = panfrost_emit_varyings(batch,
2019 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2020 2, vertex_count);
2021 }
2022
2023 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2024 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2025 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2026
2027 vertex_postfix->varyings = T.gpu;
2028 tiler_postfix->varyings = T.gpu;
2029
2030 vertex_postfix->varying_meta = trans.gpu;
2031 tiler_postfix->varying_meta = trans.gpu + vs_size;
2032 }
2033
2034 void
2035 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2036 struct mali_vertex_tiler_prefix *vertex_prefix,
2037 struct mali_vertex_tiler_postfix *vertex_postfix,
2038 struct mali_vertex_tiler_prefix *tiler_prefix,
2039 struct mali_vertex_tiler_postfix *tiler_postfix,
2040 union midgard_primitive_size *primitive_size)
2041 {
2042 struct panfrost_context *ctx = batch->ctx;
2043 struct panfrost_device *device = pan_device(ctx->base.screen);
2044 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2045 struct bifrost_payload_vertex bifrost_vertex = {0,};
2046 struct bifrost_payload_tiler bifrost_tiler = {0,};
2047 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2048 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2049 void *vp, *tp;
2050 size_t vp_size, tp_size;
2051
2052 if (device->quirks & IS_BIFROST) {
2053 bifrost_vertex.prefix = *vertex_prefix;
2054 bifrost_vertex.postfix = *vertex_postfix;
2055 vp = &bifrost_vertex;
2056 vp_size = sizeof(bifrost_vertex);
2057
2058 bifrost_tiler.prefix = *tiler_prefix;
2059 bifrost_tiler.tiler.primitive_size = *primitive_size;
2060 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2061 bifrost_tiler.postfix = *tiler_postfix;
2062 tp = &bifrost_tiler;
2063 tp_size = sizeof(bifrost_tiler);
2064 } else {
2065 midgard_vertex.prefix = *vertex_prefix;
2066 midgard_vertex.postfix = *vertex_postfix;
2067 vp = &midgard_vertex;
2068 vp_size = sizeof(midgard_vertex);
2069
2070 midgard_tiler.prefix = *tiler_prefix;
2071 midgard_tiler.postfix = *tiler_postfix;
2072 midgard_tiler.primitive_size = *primitive_size;
2073 tp = &midgard_tiler;
2074 tp_size = sizeof(midgard_tiler);
2075 }
2076
2077 if (wallpapering) {
2078 /* Inject in reverse order, with "predicted" job indices.
2079 * THIS IS A HACK XXX */
2080 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2081 batch->scoreboard.job_index + 2, tp, tp_size, true);
2082 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2083 vp, vp_size, true);
2084 return;
2085 }
2086
2087         /* If rasterizer discard is enabled, only submit the vertex job */
2088
2089 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2090 vp, vp_size, false);
2091
2092 if (ctx->rasterizer->base.rasterizer_discard)
2093 return;
2094
2095 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2096 false);
2097 }
2098
2099 /* TODO: stop hardcoding this */
2100 mali_ptr
2101 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2102 {
2103 uint16_t locations[] = {
2104 128, 128,
2105 0, 256,
2106 0, 256,
2107 0, 256,
2108 0, 256,
2109 0, 256,
2110 0, 256,
2111 0, 256,
2112 0, 256,
2113 0, 256,
2114 0, 256,
2115 0, 256,
2116 0, 256,
2117 0, 256,
2118 0, 256,
2119 0, 256,
2120 0, 256,
2121 0, 256,
2122 0, 256,
2123 0, 256,
2124 0, 256,
2125 0, 256,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 0, 256,
2130 0, 256,
2131 0, 256,
2132 0, 256,
2133 0, 256,
2134 0, 256,
2135 0, 256,
2136 128, 128,
2137 0, 0,
2138 0, 0,
2139 0, 0,
2140 0, 0,
2141 0, 0,
2142 0, 0,
2143 0, 0,
2144 0, 0,
2145 0, 0,
2146 0, 0,
2147 0, 0,
2148 0, 0,
2149 0, 0,
2150 0, 0,
2151 0, 0,
2152 };
2153
2154 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2155 }