c214dd6ba726a3458de27d809826b91fff4e861c
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), though it could last longer. Also
182 * gets the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
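/*
 * Worked example of the encoding above: instance_shift/instance_odd split the
 * padded count into a power-of-two part and an odd part, which the hardware
 * presumably reconstructs as (2 * instance_odd + 1) << instance_shift. With a
 * padded count of 12: shift = ctz(12) = 2, odd = 12 >> 3 = 1, and indeed
 * (2 * 1 + 1) << 2 = 12.
 */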
308
309 static void
310 panfrost_emit_compute_shader(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 meta->shader = ss->shader;
319 meta->attribute_count = ss->attribute_count;
320 meta->varying_count = ss->varying_count;
321 meta->texture_count = ctx->sampler_view_count[st];
322 meta->sampler_count = ctx->sampler_count[st];
323
324 if (dev->quirks & IS_BIFROST) {
325 struct mali_bifrost_properties_packed prop;
326
327 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
328 cfg.unknown = 0x800000; /* XXX */
329 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, st);
330 }
331
332 memcpy(&meta->bifrost_props, &prop, sizeof(prop));
333
334 meta->bifrost2.preload_regs = 0xC0;
335 meta->bifrost2.uniform_count = ss->uniform_count;
336 } else {
337 struct mali_midgard_properties_packed prop;
338
339 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
340 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, st);
341 cfg.uniform_count = ss->uniform_count;
342 cfg.work_register_count = ss->work_reg_count;
343 cfg.writes_globals = ss->writes_global;
344 cfg.suppress_inf_nan = true; /* XXX */
345 }
346
347 memcpy(&meta->midgard_props, &prop, sizeof(prop));
348 }
349 }
350
351 static unsigned
352 translate_tex_wrap(enum pipe_tex_wrap w)
353 {
354 switch (w) {
355 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
356 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
357 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
358 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
359 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
360 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
361 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
362 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
363 default: unreachable("Invalid wrap");
364 }
365 }
366
367 /* The hardware compares in the wrong order, so we have to flip before
368 * encoding. Yes, really. */
369
370 static enum mali_func
371 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
372 {
373 if (!cso->compare_mode)
374 return MALI_FUNC_NEVER;
375
376 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
377 return panfrost_flip_compare_func(f);
378 }
379
380 static enum mali_mipmap_mode
381 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
382 {
383 switch (f) {
384 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
385 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
386 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
387 default: unreachable("Invalid");
388 }
389 }
390
391 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
392 struct mali_midgard_sampler_packed *hw)
393 {
394 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
395 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
396 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
397 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
398 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
399 cfg.normalized_coordinates = cso->normalized_coords;
400
401 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
402
403 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
404
405 /* If necessary, we disable mipmapping in the sampler descriptor by
406 * clamping the LOD as tight as possible (from 0 to epsilon,
407 * essentially -- remember these are fixed point numbers, so
408 * epsilon=1/256) */
409
410 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
411 cfg.minimum_lod + 1 :
412 FIXED_16(cso->max_lod, false);
413
414 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
415 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
416 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
417
418 cfg.compare_function = panfrost_sampler_compare_func(cso);
419 cfg.seamless_cube_map = cso->seamless_cube_map;
420
421 cfg.border_color_r = cso->border_color.f[0];
422 cfg.border_color_g = cso->border_color.f[1];
423 cfg.border_color_b = cso->border_color.f[2];
424 cfg.border_color_a = cso->border_color.f[3];
425 }
426 }
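/*
 * For instance, with min_lod = 0 and PIPE_TEX_MIPFILTER_NONE the pack above
 * yields maximum_lod = minimum_lod + 1 in fixed point, i.e. an effective LOD
 * range of [0, 1/256], which pins sampling to the base level as described.
 */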
427
428 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
429 struct mali_bifrost_sampler_packed *hw)
430 {
431 pan_pack(hw, BIFROST_SAMPLER, cfg) {
432 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
433 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
434 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
435 cfg.normalized_coordinates = cso->normalized_coords;
436
437 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
438 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
439 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
440
441 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
442 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
443 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
444
445 cfg.compare_function = panfrost_sampler_compare_func(cso);
446 cfg.seamless_cube_map = cso->seamless_cube_map;
447 }
448 }
449
450 static bool
451 panfrost_fs_required(
452 struct panfrost_shader_state *fs,
453 struct panfrost_blend_final *blend,
454 unsigned rt_count)
455 {
456 /* If we generally have side effects */
457 if (fs->fs_sidefx)
458 return true;
459
460 /* If colour is written we need to execute */
461 for (unsigned i = 0; i < rt_count; ++i) {
462 if (!blend[i].no_colour)
463 return true;
464 }
465
466 /* If depth is written and not implied we need to execute.
467 * TODO: Predicate on Z/S writes being enabled */
468 return (fs->writes_depth || fs->writes_stencil);
469 }
470
471 static void
472 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
473 struct panfrost_blend_final *blend)
474 {
475 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
476 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
477 unsigned rt_count = batch->key.nr_cbufs;
478
479 struct bifrost_blend_rt *brts = rts;
480 struct midgard_blend_rt *mrts = rts;
481
482 /* Disable blending for depth-only on Bifrost */
483
484 if (rt_count == 0 && dev->quirks & IS_BIFROST)
485 brts[0].unk2 = 0x3;
486
487 for (unsigned i = 0; i < rt_count; ++i) {
488 unsigned flags = 0;
489
490 pan_pack(&flags, BLEND_FLAGS, cfg) {
491 if (blend[i].no_colour) {
492 cfg.enable = false;
493 break;
494 }
495
496 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
497
498 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
499 cfg.load_destination = blend[i].load_dest;
500 cfg.dither_disable = !batch->ctx->blend->base.dither;
501
502 if (!(dev->quirks & IS_BIFROST))
503 cfg.midgard_blend_shader = blend[i].is_shader;
504 }
505
506 if (dev->quirks & IS_BIFROST) {
507 brts[i].flags = flags;
508
509 if (blend[i].is_shader) {
510 /* The blend shader's address needs to be at
511 * the same top 32 bits as the fragment shader.
512 * TODO: Ensure that's always the case.
513 */
514 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
515 (fs->bo->gpu & (0xffffffffull << 32)));
516 brts[i].shader = blend[i].shader.gpu;
517 brts[i].unk2 = 0x0;
518 } else {
519 enum pipe_format format = batch->key.cbufs[i]->format;
520 const struct util_format_description *format_desc;
521 format_desc = util_format_description(format);
522
523 brts[i].equation = blend[i].equation.equation;
524
525 /* TODO: this is a bit more complicated */
526 brts[i].constant = blend[i].equation.constant;
527
528 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
529
530 /* 0x19 disables blending and forces REPLACE
531 * mode (equivalent to rgb_mode = alpha_mode =
532 * 0x122, colour mask = 0xF). 0x1a allows
533 * blending. */
534 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
535
536 brts[i].shader_type = fs->blend_types[i];
537 }
538 } else {
539 memcpy(&mrts[i].flags, &flags, sizeof(flags));
540
541 if (blend[i].is_shader) {
542 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
543 } else {
544 mrts[i].blend.equation = blend[i].equation.equation;
545 mrts[i].blend.constant = blend[i].equation.constant;
546 }
547 }
548 }
549 }
550
551 static void
552 panfrost_emit_frag_shader(struct panfrost_context *ctx,
553 struct mali_shader_meta *fragmeta,
554 struct panfrost_blend_final *blend)
555 {
556 const struct panfrost_device *dev = pan_device(ctx->base.screen);
557 struct panfrost_shader_state *fs;
558
559 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
560
561 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
562 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
563 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
564
565 memset(fragmeta, 0, sizeof(*fragmeta));
566
567 fragmeta->shader = fs->shader;
568 fragmeta->attribute_count = fs->attribute_count;
569 fragmeta->varying_count = fs->varying_count;
570 fragmeta->texture_count = ctx->sampler_view_count[PIPE_SHADER_FRAGMENT];
571 fragmeta->sampler_count = ctx->sampler_count[PIPE_SHADER_FRAGMENT];
572
573 if (dev->quirks & IS_BIFROST) {
574 struct mali_bifrost_properties_packed prop;
575
576 bool no_blend = true;
577
578 for (unsigned i = 0; i < rt_count; ++i)
579 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
580
581 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
582 cfg.unknown = 0x950020; /* XXX */
583 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
584 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
585 }
586
587 memcpy(&fragmeta->bifrost_props, &prop, sizeof(prop));
588
589 fragmeta->bifrost2.preload_regs = 0x1;
590 SET_BIT(fragmeta->bifrost2.preload_regs, 0x10, fs->reads_frag_coord);
591
592 fragmeta->bifrost2.uniform_count = fs->uniform_count;
593 } else {
594 struct mali_midgard_properties_packed prop;
595
596 /* Reasons to disable early-Z from a shader perspective */
597 bool late_z = fs->can_discard || fs->writes_global ||
598 fs->writes_depth || fs->writes_stencil;
599
600 /* Reasons to disable early-Z from a CSO perspective */
601 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
602
603 /* If either depth or stencil is enabled, discard matters */
604 bool zs_enabled =
605 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
606 zsa->base.stencil[0].enabled;
607
608 bool has_blend_shader = false;
609
610 for (unsigned c = 0; c < rt_count; ++c)
611 has_blend_shader |= blend[c].is_shader;
612
613 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
614 cfg.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
615 cfg.uniform_count = fs->uniform_count;
616 cfg.work_register_count = fs->work_reg_count;
617 cfg.writes_globals = fs->writes_global;
618 cfg.suppress_inf_nan = true; /* XXX */
619
620 /* TODO: Reduce this limit? */
621 if (has_blend_shader)
622 cfg.work_register_count = MAX2(cfg.work_register_count, 8);
623
624 cfg.stencil_from_shader = fs->writes_stencil;
625 cfg.helper_invocation_enable = fs->helper_invocations;
626 cfg.depth_source = fs->writes_depth ?
627 MALI_DEPTH_SOURCE_SHADER :
628 MALI_DEPTH_SOURCE_FIXED_FUNCTION;
629
630 /* Depend on other state */
631 cfg.early_z_enable = !(late_z || alpha_to_coverage);
632 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
633 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
634 }
635
636 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
637 }
638
639 bool msaa = rast->multisample;
640 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
641
642 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
643 fragmeta->unknown2_4 = 0x4e0;
644
645 /* TODO: Sample size */
646 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
647 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
648
649 /* EXT_shader_framebuffer_fetch requires the shader to be run
650 * per-sample when outputs are read. */
651 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
652 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
653
654 fragmeta->depth_units = rast->offset_units * 2.0f;
655 fragmeta->depth_factor = rast->offset_scale;
656
657 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
658
659 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
660 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
661
662 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
663 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
664
665 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
666 zsa->base.stencil[0].enabled);
667
668 fragmeta->stencil_mask_front = zsa->stencil_mask_front;
669 fragmeta->stencil_mask_back = zsa->stencil_mask_back;
670
671 /* Bottom bits for stencil ref, exactly one word */
672 fragmeta->stencil_front.opaque[0] = zsa->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
673
674 /* If back-stencil is not enabled, use the front values */
675
676 if (zsa->base.stencil[1].enabled)
677 fragmeta->stencil_back.opaque[0] = zsa->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
678 else
679 fragmeta->stencil_back = fragmeta->stencil_front;
680
681 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
682 zsa->base.depth.writemask);
683
684 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
685 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
686 zsa->base.depth.enabled ? zsa->base.depth.func : PIPE_FUNC_ALWAYS));
687
688 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
689 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
690 !ctx->blend->base.dither);
691
692 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
693
694 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
695 ctx->blend->base.alpha_to_coverage);
696
697 /* Disable shader execution if we can */
698 if (dev->quirks & MIDGARD_SHADERLESS
699 && !panfrost_fs_required(fs, blend, rt_count)) {
700 fragmeta->shader = 0x1;
701 fragmeta->attribute_count = 0;
702 fragmeta->varying_count = 0;
703 fragmeta->texture_count = 0;
704 fragmeta->sampler_count = 0;
705
706 /* This feature is not known to work on Bifrost */
707 struct mali_midgard_properties_packed prop;
708
709 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
710 cfg.work_register_count = 1;
711 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
712 cfg.early_z_enable = true;
713 }
714
715 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
716 }
717
718 if (dev->quirks & MIDGARD_SFBD) {
719 /* On platforms with only a single render target (SFBD), the blend
720 * information is inside the shader meta itself. We additionally
721 * need to signal CAN_DISCARD for nontrivial blend modes (so
722 * we're able to read back the destination buffer) */
723
724 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
725 blend[0].is_shader);
726
727 if (blend[0].is_shader) {
728 fragmeta->blend.shader = blend[0].shader.gpu |
729 blend[0].shader.first_tag;
730 } else {
731 fragmeta->blend.equation = blend[0].equation.equation;
732 fragmeta->blend.constant = blend[0].equation.constant;
733 }
734
735 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
736 blend[0].load_dest);
737 } else if (!(dev->quirks & IS_BIFROST)) {
738 /* Bug where MRT-capable hw apparently reads the last blend
739 * shader from here instead of the usual location? */
740
741 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
742 if (!blend[rt].is_shader)
743 continue;
744
745 fragmeta->blend.shader = blend[rt].shader.gpu |
746 blend[rt].shader.first_tag;
747 break;
748 }
749 }
750 }
751
752 void
753 panfrost_emit_shader_meta(struct panfrost_batch *batch,
754 enum pipe_shader_type st,
755 struct mali_vertex_tiler_postfix *postfix)
756 {
757 struct panfrost_context *ctx = batch->ctx;
758 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
759
760 if (!ss) {
761 postfix->shader = 0;
762 return;
763 }
764
765 struct mali_shader_meta meta;
766
767 /* Add the shader BO to the batch. */
768 panfrost_batch_add_bo(batch, ss->bo,
769 PAN_BO_ACCESS_PRIVATE |
770 PAN_BO_ACCESS_READ |
771 panfrost_bo_access_for_stage(st));
772
773 mali_ptr shader_ptr;
774
775 if (st == PIPE_SHADER_FRAGMENT) {
776 struct panfrost_device *dev = pan_device(ctx->base.screen);
777 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
778 size_t desc_size = sizeof(meta);
779 void *rts = NULL;
780 struct panfrost_transfer xfer;
781 unsigned rt_size;
782
783 if (dev->quirks & MIDGARD_SFBD)
784 rt_size = 0;
785 else if (dev->quirks & IS_BIFROST)
786 rt_size = sizeof(struct bifrost_blend_rt);
787 else
788 rt_size = sizeof(struct midgard_blend_rt);
789
790 desc_size += rt_size * rt_count;
791
792 if (rt_size)
793 rts = rzalloc_size(ctx, rt_size * rt_count);
794
795 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
796
797 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
798 blend[c] = panfrost_get_blend_for_context(ctx, c);
799
800 panfrost_emit_frag_shader(ctx, &meta, blend);
801
802 if (!(dev->quirks & MIDGARD_SFBD))
803 panfrost_emit_blend(batch, rts, blend);
804 else
805 batch->draws |= PIPE_CLEAR_COLOR0;
806
807 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
808
809 memcpy(xfer.cpu, &meta, sizeof(meta));
810 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
811
812 if (rt_size)
813 ralloc_free(rts);
814
815 shader_ptr = xfer.gpu;
816 } else {
817 panfrost_emit_compute_shader(ctx, st, &meta);
818
819 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
820 sizeof(meta));
821 }
822
823 postfix->shader = shader_ptr;
824 }
825
826 void
827 panfrost_emit_viewport(struct panfrost_batch *batch,
828 struct mali_vertex_tiler_postfix *tiler_postfix)
829 {
830 struct panfrost_context *ctx = batch->ctx;
831 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
832 const struct pipe_scissor_state *ss = &ctx->scissor;
833 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
834 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
835
836 /* Derive min/max from translate/scale. Note since |x| >= 0 by
837 * definition, we have that -|x| <= |x| hence translate - |scale| <=
838 * translate + |scale|, so the ordering is correct here. */
839 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
840 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
841 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
842 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
843 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
844 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
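/*
 * For example, an 800x600 viewport mapped with the usual GL convention has
 * translate[0] = 400 and scale[0] = 400, giving vp_minx = 0 and vp_maxx = 800;
 * a flipped Y axis only negates scale[1], which is why fabsf() is used above.
 */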
845
846 /* Scissor to the intersection of the viewport and the scissor, clamped
847 * to the framebuffer */
848
849 unsigned minx = MIN2(fb->width, vp_minx);
850 unsigned maxx = MIN2(fb->width, vp_maxx);
851 unsigned miny = MIN2(fb->height, vp_miny);
852 unsigned maxy = MIN2(fb->height, vp_maxy);
853
854 if (ss && rast->scissor) {
855 minx = MAX2(ss->minx, minx);
856 miny = MAX2(ss->miny, miny);
857 maxx = MIN2(ss->maxx, maxx);
858 maxy = MIN2(ss->maxy, maxy);
859 }
860
861 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
862
863 pan_pack(T.cpu, VIEWPORT, cfg) {
864 cfg.scissor_minimum_x = minx;
865 cfg.scissor_minimum_y = miny;
866 cfg.scissor_maximum_x = maxx - 1;
867 cfg.scissor_maximum_y = maxy - 1;
868
869 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
870 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
871 }
872
873 tiler_postfix->viewport = T.gpu;
874 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
875 }
876
877 static mali_ptr
878 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
879 enum pipe_shader_type st,
880 struct panfrost_constant_buffer *buf,
881 unsigned index)
882 {
883 struct pipe_constant_buffer *cb = &buf->cb[index];
884 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
885
886 if (rsrc) {
887 panfrost_batch_add_bo(batch, rsrc->bo,
888 PAN_BO_ACCESS_SHARED |
889 PAN_BO_ACCESS_READ |
890 panfrost_bo_access_for_stage(st));
891
892 /* Alignment guaranteed by
893 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
894 return rsrc->bo->gpu + cb->buffer_offset;
895 } else if (cb->user_buffer) {
896 return panfrost_pool_upload_aligned(&batch->pool,
897 cb->user_buffer +
898 cb->buffer_offset,
899 cb->buffer_size, 16);
900 } else {
901 unreachable("No constant buffer");
902 }
903 }
904
905 struct sysval_uniform {
906 union {
907 float f[4];
908 int32_t i[4];
909 uint32_t u[4];
910 uint64_t du[2];
911 };
912 };
913
914 static void
915 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
916 struct sysval_uniform *uniform)
917 {
918 struct panfrost_context *ctx = batch->ctx;
919 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
920
921 uniform->f[0] = vp->scale[0];
922 uniform->f[1] = vp->scale[1];
923 uniform->f[2] = vp->scale[2];
924 }
925
926 static void
927 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
928 struct sysval_uniform *uniform)
929 {
930 struct panfrost_context *ctx = batch->ctx;
931 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
932
933 uniform->f[0] = vp->translate[0];
934 uniform->f[1] = vp->translate[1];
935 uniform->f[2] = vp->translate[2];
936 }
937
938 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
939 enum pipe_shader_type st,
940 unsigned int sysvalid,
941 struct sysval_uniform *uniform)
942 {
943 struct panfrost_context *ctx = batch->ctx;
944 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
945 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
946 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
947 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
948
949 assert(dim);
950 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
951
952 if (dim > 1)
953 uniform->i[1] = u_minify(tex->texture->height0,
954 tex->u.tex.first_level);
955
956 if (dim > 2)
957 uniform->i[2] = u_minify(tex->texture->depth0,
958 tex->u.tex.first_level);
959
960 if (is_array)
961 uniform->i[dim] = tex->texture->array_size;
962 }
963
964 static void
965 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
966 enum pipe_shader_type st,
967 unsigned ssbo_id,
968 struct sysval_uniform *uniform)
969 {
970 struct panfrost_context *ctx = batch->ctx;
971
972 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
973 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
974
975 /* Compute address */
976 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
977
978 panfrost_batch_add_bo(batch, bo,
979 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
980 panfrost_bo_access_for_stage(st));
981
982 /* Upload address and size as sysval */
983 uniform->du[0] = bo->gpu + sb.buffer_offset;
984 uniform->u[2] = sb.buffer_size;
985 }
986
987 static void
988 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
989 enum pipe_shader_type st,
990 unsigned samp_idx,
991 struct sysval_uniform *uniform)
992 {
993 struct panfrost_context *ctx = batch->ctx;
994 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
995
996 uniform->f[0] = sampl->min_lod;
997 uniform->f[1] = sampl->max_lod;
998 uniform->f[2] = sampl->lod_bias;
999
1000 /* Even without any errata, Midgard represents "no mipmapping" as
1001 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1002 * panfrost_create_sampler_state which also explains our choice of
1003 * epsilon value (again to keep behaviour consistent) */
1004
1005 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1006 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1007 }
1008
1009 static void
1010 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1011 struct sysval_uniform *uniform)
1012 {
1013 struct panfrost_context *ctx = batch->ctx;
1014
1015 uniform->u[0] = ctx->compute_grid->grid[0];
1016 uniform->u[1] = ctx->compute_grid->grid[1];
1017 uniform->u[2] = ctx->compute_grid->grid[2];
1018 }
1019
1020 static void
1021 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1022 struct panfrost_shader_state *ss,
1023 enum pipe_shader_type st)
1024 {
1025 struct sysval_uniform *uniforms = (void *)buf;
1026
1027 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1028 int sysval = ss->sysval[i];
1029
1030 switch (PAN_SYSVAL_TYPE(sysval)) {
1031 case PAN_SYSVAL_VIEWPORT_SCALE:
1032 panfrost_upload_viewport_scale_sysval(batch,
1033 &uniforms[i]);
1034 break;
1035 case PAN_SYSVAL_VIEWPORT_OFFSET:
1036 panfrost_upload_viewport_offset_sysval(batch,
1037 &uniforms[i]);
1038 break;
1039 case PAN_SYSVAL_TEXTURE_SIZE:
1040 panfrost_upload_txs_sysval(batch, st,
1041 PAN_SYSVAL_ID(sysval),
1042 &uniforms[i]);
1043 break;
1044 case PAN_SYSVAL_SSBO:
1045 panfrost_upload_ssbo_sysval(batch, st,
1046 PAN_SYSVAL_ID(sysval),
1047 &uniforms[i]);
1048 break;
1049 case PAN_SYSVAL_NUM_WORK_GROUPS:
1050 panfrost_upload_num_work_groups_sysval(batch,
1051 &uniforms[i]);
1052 break;
1053 case PAN_SYSVAL_SAMPLER:
1054 panfrost_upload_sampler_sysval(batch, st,
1055 PAN_SYSVAL_ID(sysval),
1056 &uniforms[i]);
1057 break;
1058 default:
1059 assert(0);
1060 }
1061 }
1062 }
1063
1064 static const void *
1065 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1066 unsigned index)
1067 {
1068 struct pipe_constant_buffer *cb = &buf->cb[index];
1069 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1070
1071 if (rsrc)
1072 return rsrc->bo->cpu;
1073 else if (cb->user_buffer)
1074 return cb->user_buffer;
1075 else
1076 unreachable("No constant buffer");
1077 }
1078
1079 void
1080 panfrost_emit_const_buf(struct panfrost_batch *batch,
1081 enum pipe_shader_type stage,
1082 struct mali_vertex_tiler_postfix *postfix)
1083 {
1084 struct panfrost_context *ctx = batch->ctx;
1085 struct panfrost_shader_variants *all = ctx->shader[stage];
1086
1087 if (!all)
1088 return;
1089
1090 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1091
1092 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1093
1094 /* Uniforms are implicitly UBO #0 */
1095 bool has_uniforms = buf->enabled_mask & (1 << 0);
1096
1097 /* Allocate room for the sysval and the uniforms */
1098 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1099 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1100 size_t size = sys_size + uniform_size;
1101 struct panfrost_transfer transfer =
1102 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1103
1104 /* Upload sysvals requested by the shader */
1105 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1106
1107 /* Upload uniforms */
1108 if (has_uniforms && uniform_size) {
1109 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1110 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1111 }
1112
1113 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1114 * uploaded */
1115
1116 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1117 assert(ubo_count >= 1);
1118
1119 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1120 struct panfrost_transfer ubos =
1121 panfrost_pool_alloc_aligned(&batch->pool, sz,
1122 MALI_UNIFORM_BUFFER_LENGTH);
1123
1124 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1125
1126 /* Upload uniforms as a UBO */
1127
1128 if (size) {
1129 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1130 cfg.entries = DIV_ROUND_UP(size, 16);
1131 cfg.pointer = transfer.gpu;
1132 }
1133 } else {
1134 *ubo_ptr = 0;
1135 }
1136
1137 /* The rest are honest-to-goodness UBOs */
1138
1139 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1140 size_t usz = buf->cb[ubo].buffer_size;
1141 bool enabled = buf->enabled_mask & (1 << ubo);
1142 bool empty = usz == 0;
1143
1144 if (!enabled || empty) {
1145 ubo_ptr[ubo] = 0;
1146 continue;
1147 }
1148
1149 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1150 cfg.entries = DIV_ROUND_UP(usz, 16);
1151 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1152 stage, buf, ubo);
1153 }
1154 }
1155
1156 postfix->uniforms = transfer.gpu;
1157 postfix->uniform_buffers = ubos.gpu;
1158
1159 buf->dirty_mask = 0;
1160 }
1161
1162 void
1163 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1164 const struct pipe_grid_info *info,
1165 struct midgard_payload_vertex_tiler *vtp)
1166 {
1167 struct panfrost_context *ctx = batch->ctx;
1168 struct panfrost_device *dev = pan_device(ctx->base.screen);
1169 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1170 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1171 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1172 128));
1173
1174 unsigned log2_instances =
1175 util_logbase2_ceil(info->grid[0]) +
1176 util_logbase2_ceil(info->grid[1]) +
1177 util_logbase2_ceil(info->grid[2]);
1178
1179 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1180 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1181 shared_size,
1182 1);
1183
1184 struct mali_shared_memory shared = {
1185 .shared_memory = bo->gpu,
1186 .shared_workgroup_count = log2_instances,
1187 .shared_shift = util_logbase2(single_size) + 1
1188 };
1189
1190 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1191 sizeof(shared), 64);
1192 }
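/*
 * Rough sizing example for the above: a grid of (3, 4, 1) workgroups gives
 * log2_instances = 2 + 2 + 0 = 4; with ss->shared_size = 512 that means
 * single_size = 512, shared_size = 512 * 16 * core_count bytes, and
 * shared_shift = log2(512) + 1 = 10.
 */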
1193
1194 static mali_ptr
1195 panfrost_get_tex_desc(struct panfrost_batch *batch,
1196 enum pipe_shader_type st,
1197 struct panfrost_sampler_view *view)
1198 {
1199 if (!view)
1200 return (mali_ptr) 0;
1201
1202 struct pipe_sampler_view *pview = &view->base;
1203 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1204
1205 /* Add the BO to the job so it's retained until the job is done. */
1206
1207 panfrost_batch_add_bo(batch, rsrc->bo,
1208 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1209 panfrost_bo_access_for_stage(st));
1210
1211 panfrost_batch_add_bo(batch, view->bo,
1212 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1213 panfrost_bo_access_for_stage(st));
1214
1215 return view->bo->gpu;
1216 }
1217
1218 static void
1219 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1220 struct pipe_context *pctx)
1221 {
1222 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1223 if (view->texture_bo != rsrc->bo->gpu ||
1224 view->modifier != rsrc->modifier) {
1225 panfrost_bo_unreference(view->bo);
1226 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1227 }
1228 }
1229
1230 void
1231 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1232 enum pipe_shader_type stage,
1233 struct mali_vertex_tiler_postfix *postfix)
1234 {
1235 struct panfrost_context *ctx = batch->ctx;
1236 struct panfrost_device *device = pan_device(ctx->base.screen);
1237
1238 if (!ctx->sampler_view_count[stage])
1239 return;
1240
1241 if (device->quirks & IS_BIFROST) {
1242 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1243 MALI_BIFROST_TEXTURE_LENGTH *
1244 ctx->sampler_view_count[stage],
1245 MALI_BIFROST_TEXTURE_LENGTH);
1246
1247 struct mali_bifrost_texture_packed *out =
1248 (struct mali_bifrost_texture_packed *) T.cpu;
1249
1250 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1251 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1252 struct pipe_sampler_view *pview = &view->base;
1253 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1254
1255 panfrost_update_sampler_view(view, &ctx->base);
1256 out[i] = view->bifrost_descriptor;
1257
1258 /* Add the BOs to the job so they are retained until the job is done. */
1259
1260 panfrost_batch_add_bo(batch, rsrc->bo,
1261 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1262 panfrost_bo_access_for_stage(stage));
1263
1264 panfrost_batch_add_bo(batch, view->bo,
1265 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1266 panfrost_bo_access_for_stage(stage));
1267 }
1268
1269 postfix->textures = T.gpu;
1270 } else {
1271 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1272
1273 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1274 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1275
1276 panfrost_update_sampler_view(view, &ctx->base);
1277
1278 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1279 }
1280
1281 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1282 trampolines,
1283 sizeof(uint64_t) *
1284 ctx->sampler_view_count[stage],
1285 sizeof(uint64_t));
1286 }
1287 }
1288
1289 void
1290 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1291 enum pipe_shader_type stage,
1292 struct mali_vertex_tiler_postfix *postfix)
1293 {
1294 struct panfrost_context *ctx = batch->ctx;
1295
1296 if (!ctx->sampler_count[stage])
1297 return;
1298
1299 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1300 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1301
1302 size_t sz = desc_size * ctx->sampler_count[stage];
1303 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1304 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1305
1306 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1307 out[i] = ctx->samplers[stage][i]->hw;
1308
1309 postfix->sampler_descriptor = T.gpu;
1310 }
1311
1312 void
1313 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1314 struct mali_vertex_tiler_postfix *vertex_postfix)
1315 {
1316 struct panfrost_context *ctx = batch->ctx;
1317 struct panfrost_vertex_state *so = ctx->vertex;
1318 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1319
1320 unsigned instance_shift = vertex_postfix->instance_shift;
1321 unsigned instance_odd = vertex_postfix->instance_odd;
1322
1323 /* Worst case: everything is NPOT, which is only possible if instancing
1324 * is enabled. Otherwise a single record is guaranteed */
1325 bool could_npot = instance_shift || instance_odd;
1326
1327 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1328 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1329 (could_npot ? 2 : 1),
1330 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1331
1332 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1333 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1334 MALI_ATTRIBUTE_LENGTH);
1335
1336 struct mali_attribute_buffer_packed *bufs =
1337 (struct mali_attribute_buffer_packed *) S.cpu;
1338
1339 struct mali_attribute_packed *out =
1340 (struct mali_attribute_packed *) T.cpu;
1341
1342 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1343 unsigned k = 0;
1344
1345 for (unsigned i = 0; i < so->num_elements; ++i) {
1346 /* We map buffers 1:1 with the attributes, which
1347 * means duplicating some vertex buffers (who cares? aside from
1348 * maybe some caching implications but I somehow doubt that
1349 * matters) */
1350
1351 struct pipe_vertex_element *elem = &so->pipe[i];
1352 unsigned vbi = elem->vertex_buffer_index;
1353 attrib_to_buffer[i] = k;
1354
1355 if (!(ctx->vb_mask & (1 << vbi)))
1356 continue;
1357
1358 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1359 struct panfrost_resource *rsrc;
1360
1361 rsrc = pan_resource(buf->buffer.resource);
1362 if (!rsrc)
1363 continue;
1364
1365 /* Add a dependency of the batch on the vertex buffer */
1366 panfrost_batch_add_bo(batch, rsrc->bo,
1367 PAN_BO_ACCESS_SHARED |
1368 PAN_BO_ACCESS_READ |
1369 PAN_BO_ACCESS_VERTEX_TILER);
1370
1371 /* Mask off lower bits, see offset fixup below */
1372 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1373 mali_ptr addr = raw_addr & ~63;
1374
1375 /* Since we advanced the base pointer, we shrink the buffer
1376 * size, but add the offset we subtracted */
1377 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1378 - buf->buffer_offset;
1379
1380 /* When there is a divisor, the hardware-level divisor is
1381 * the product of the instance divisor and the padded count */
1382 unsigned divisor = elem->instance_divisor;
1383 unsigned hw_divisor = ctx->padded_count * divisor;
1384 unsigned stride = buf->stride;
1385
1386 /* If there's a divisor (=1) but no instancing, we want every
1387 * attribute to be the same */
1388
1389 if (divisor && ctx->instance_count == 1)
1390 stride = 0;
1391
1392 if (!divisor || ctx->instance_count <= 1) {
1393 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1394 if (ctx->instance_count > 1)
1395 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1396
1397 cfg.pointer = addr;
1398 cfg.stride = stride;
1399 cfg.size = size;
1400 cfg.divisor_r = instance_shift;
1401 cfg.divisor_p = instance_odd;
1402 }
1403 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1404 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1405 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1406 cfg.pointer = addr;
1407 cfg.stride = stride;
1408 cfg.size = size;
1409 cfg.divisor_r = __builtin_ctz(hw_divisor);
1410 }
1411
1412 } else {
1413 unsigned shift = 0, extra_flags = 0;
1414
1415 unsigned magic_divisor =
1416 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1417
1418 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1419 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1420 cfg.pointer = addr;
1421 cfg.stride = stride;
1422 cfg.size = size;
1423
1424 cfg.divisor_r = shift;
1425 cfg.divisor_e = extra_flags;
1426 }
1427
1428 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1429 cfg.divisor_numerator = magic_divisor;
1430 cfg.divisor = divisor;
1431 }
1432
1433 ++k;
1434 }
1435
1436 ++k;
1437 }
1438
1439 /* Add special gl_VertexID/gl_InstanceID buffers */
1440
1441 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1442 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1443
1444 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1445 cfg.buffer_index = k++;
1446 cfg.format = so->formats[PAN_VERTEX_ID];
1447 }
1448
1449 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1450
1451 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1452 cfg.buffer_index = k++;
1453 cfg.format = so->formats[PAN_INSTANCE_ID];
1454 }
1455 }
1456
1457 /* Attribute addresses require 64-byte alignment, so let:
1458 *
1459 * base' = base & ~63 = base - (base & 63)
1460 * offset' = offset + (base & 63)
1461 *
1462 * Since base' + offset' = base + offset, these are equivalent
1463 * addressing modes and now base is 64 aligned.
1464 */
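/*
 * For example (assuming a 4k-aligned BO), base = bo->gpu + buffer_offset =
 * 0x10064 gives base & 63 = 0x24, so base' = 0x10040 and the 0x24 is folded
 * into the attribute record's src_offset in the loop below; base' + offset'
 * is unchanged.
 */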
1465
1466 unsigned start = vertex_postfix->offset_start;
1467
1468 for (unsigned i = 0; i < so->num_elements; ++i) {
1469 unsigned vbi = so->pipe[i].vertex_buffer_index;
1470 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1471
1472 /* Adjust by the masked off bits of the offset. Make sure we
1473 * read src_offset from so->hw (which is not GPU visible)
1474 * rather than target (which is) due to caching effects */
1475
1476 unsigned src_offset = so->pipe[i].src_offset;
1477
1478 /* BOs aligned to 4k so guaranteed aligned to 64 */
1479 src_offset += (buf->buffer_offset & 63);
1480
1481 /* Also, somewhat obscurely, per-instance data needs to be
1482 * offset in response to a delayed start in an indexed draw */
1483
1484 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1485 src_offset -= buf->stride * start;
1486
1487 pan_pack(out + i, ATTRIBUTE, cfg) {
1488 cfg.buffer_index = attrib_to_buffer[i];
1489 cfg.format = so->formats[i];
1490 cfg.offset = src_offset;
1491 }
1492 }
1493
1494 vertex_postfix->attributes = S.gpu;
1495 vertex_postfix->attribute_meta = T.gpu;
1496 }
1497
1498 static mali_ptr
1499 panfrost_emit_varyings(struct panfrost_batch *batch,
1500 struct mali_attribute_buffer_packed *slot,
1501 unsigned stride, unsigned count)
1502 {
1503 unsigned size = stride * count;
1504 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1505
1506 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1507 cfg.stride = stride;
1508 cfg.size = size;
1509 cfg.pointer = ptr;
1510 }
1511
1512 return ptr;
1513 }
1514
1515 static unsigned
1516 panfrost_streamout_offset(unsigned stride, unsigned offset,
1517 struct pipe_stream_output_target *target)
1518 {
1519 return (target->buffer_offset + (offset * stride * 4)) & 63;
1520 }
1521
1522 static void
1523 panfrost_emit_streamout(struct panfrost_batch *batch,
1524 struct mali_attribute_buffer_packed *slot,
1525 unsigned stride_words, unsigned offset, unsigned count,
1526 struct pipe_stream_output_target *target)
1527 {
1528 unsigned stride = stride_words * 4;
1529 unsigned max_size = target->buffer_size;
1530 unsigned expected_size = stride * count;
1531
1532 /* Grab the BO and bind it to the batch */
1533 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1534
1535 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1536 * the perspective of the TILER and FRAGMENT.
1537 */
1538 panfrost_batch_add_bo(batch, bo,
1539 PAN_BO_ACCESS_SHARED |
1540 PAN_BO_ACCESS_RW |
1541 PAN_BO_ACCESS_VERTEX_TILER |
1542 PAN_BO_ACCESS_FRAGMENT);
1543
1544 /* We will have an offset applied to get alignment */
1545 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1546
1547 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1548 cfg.pointer = (addr & ~63);
1549 cfg.stride = stride;
1550 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1551 }
1552 }
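/*
 * Worked example, assuming the BO itself is 64-byte aligned: with
 * buffer_offset = 100 and offset = 0, addr & 63 = 36, so the record above
 * points 36 bytes before the real start and grows cfg.size by 36, while
 * panfrost_streamout_offset() returns (100 + 0) & 63 = 36 so the varying
 * record's offset lands back on the intended byte.
 */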
1553
1554 static bool
1555 has_point_coord(unsigned mask, gl_varying_slot loc)
1556 {
1557 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1558 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1559 else if (loc == VARYING_SLOT_PNTC)
1560 return (mask & (1 << 8));
1561 else
1562 return false;
1563 }
1564
1565 /* Helpers for manipulating stream out information so we can pack varyings
1566 * accordingly. Compute the src_offset for a given captured varying */
1567
1568 static struct pipe_stream_output *
1569 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1570 {
1571 for (unsigned i = 0; i < info->num_outputs; ++i) {
1572 if (info->output[i].register_index == loc)
1573 return &info->output[i];
1574 }
1575
1576 unreachable("Varying not captured");
1577 }
1578
1579 static unsigned
1580 pan_varying_size(enum mali_format fmt)
1581 {
1582 unsigned type = MALI_EXTRACT_TYPE(fmt);
1583 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1584 unsigned bits = MALI_EXTRACT_BITS(fmt);
1585 unsigned bpc = 0;
1586
1587 if (bits == MALI_CHANNEL_FLOAT) {
1588 /* No doubles */
1589 bool fp16 = (type == MALI_FORMAT_SINT);
1590 assert(fp16 || (type == MALI_FORMAT_UNORM));
1591
1592 bpc = fp16 ? 2 : 4;
1593 } else {
1594 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1595
1596 /* See the enums */
1597 bits = 1 << bits;
1598 assert(bits >= 8);
1599 bpc = bits / 8;
1600 }
1601
1602 return bpc * chan;
1603 }
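/*
 * In other words, this returns the per-vertex footprint: e.g. an fp16 vec2
 * varying comes out as 2 * 2 = 4 bytes, while a 32-bit four-channel varying
 * comes out as 4 * 4 = 16 bytes.
 */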
1604
1605 /* Indices for named (non-XFB) varyings that are present. These are packed
1606 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1607 * PAN_VARY_*). This has the nice property that you can look up the buffer index
1608 * of a given special field given a shift S by:
1609 *
1610 * idx = popcount(P & ((1 << S) - 1))
1611 *
1612 * That is... look at all of the varyings that come earlier and count them; that
1613 * count is the new index. Likewise, the total number of special
1614 * buffers required is simply popcount(P)
1615 */
1616
1617 enum pan_special_varying {
1618 PAN_VARY_GENERAL = 0,
1619 PAN_VARY_POSITION = 1,
1620 PAN_VARY_PSIZ = 2,
1621 PAN_VARY_PNTCOORD = 3,
1622 PAN_VARY_FACE = 4,
1623 PAN_VARY_FRAGCOORD = 5,
1624
1625 /* Keep last */
1626 PAN_VARY_MAX,
1627 };
1628
1629 /* Given a varying, figure out which index it corresponds to */
1630
1631 static inline unsigned
1632 pan_varying_index(unsigned present, enum pan_special_varying v)
1633 {
1634 unsigned mask = (1 << v) - 1;
1635 return util_bitcount(present & mask);
1636 }
1637
1638 /* Get the base offset for XFB buffers, which by convention come after
1639 * everything else. Wrapper function for semantic reasons; by construction this
1640 * is just popcount. */
1641
1642 static inline unsigned
1643 pan_xfb_base(unsigned present)
1644 {
1645 return util_bitcount(present);
1646 }
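/*
 * For example, if only the general, position and point size buffers are
 * present then present = 0b111: pan_varying_index(present, PAN_VARY_PSIZ) =
 * popcount(0b011) = 2, so PSIZ is the third buffer, and XFB buffers start at
 * pan_xfb_base(present) = popcount(0b111) = 3.
 */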
1647
1648 /* Computes the present mask for varyings so we can start emitting varying records */
1649
1650 static inline unsigned
1651 pan_varying_present(
1652 struct panfrost_shader_state *vs,
1653 struct panfrost_shader_state *fs,
1654 unsigned quirks)
1655 {
1656 /* At the moment we always emit general and position buffers. Not
1657 * strictly necessary but usually harmless */
1658
1659 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1660
1661 /* Enable special buffers by the shader info */
1662
1663 if (vs->writes_point_size)
1664 present |= (1 << PAN_VARY_PSIZ);
1665
1666 if (fs->reads_point_coord)
1667 present |= (1 << PAN_VARY_PNTCOORD);
1668
1669 if (fs->reads_face)
1670 present |= (1 << PAN_VARY_FACE);
1671
1672 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1673 present |= (1 << PAN_VARY_FRAGCOORD);
1674
1675 /* Also, if we have a point sprite, we need a point coord buffer */
1676
1677 for (unsigned i = 0; i < fs->varying_count; i++) {
1678 gl_varying_slot loc = fs->varyings_loc[i];
1679
1680 if (has_point_coord(fs->point_sprite_mask, loc))
1681 present |= (1 << PAN_VARY_PNTCOORD);
1682 }
1683
1684 return present;
1685 }
1686
1687 /* Emitters for varying records */
1688
1689 static void
1690 pan_emit_vary(struct mali_attribute_packed *out,
1691 unsigned present, enum pan_special_varying buf,
1692 unsigned quirks, enum mali_format format,
1693 unsigned offset)
1694 {
1695 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1696 unsigned swizzle = quirks & HAS_SWIZZLES ?
1697 panfrost_get_default_swizzle(nr_channels) :
1698 panfrost_bifrost_swizzle(nr_channels);
1699
1700 pan_pack(out, ATTRIBUTE, cfg) {
1701 cfg.buffer_index = pan_varying_index(present, buf);
1702 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1703 cfg.format = (format << 12) | swizzle;
1704 cfg.offset = offset;
1705 }
1706 }
1707
1708 /* General varying that is unused */
1709
1710 static void
1711 pan_emit_vary_only(struct mali_attribute_packed *out,
1712 unsigned present, unsigned quirks)
1713 {
1714 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1715 }
1716
1717 /* Special records */
1718
1719 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1720 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1721 [PAN_VARY_PSIZ] = MALI_R16F,
1722 [PAN_VARY_PNTCOORD] = MALI_R16F,
1723 [PAN_VARY_FACE] = MALI_R32I,
1724 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1725 };
1726
1727 static void
1728 pan_emit_vary_special(struct mali_attribute_packed *out,
1729 unsigned present, enum pan_special_varying buf,
1730 unsigned quirks)
1731 {
1732 assert(buf < PAN_VARY_MAX);
1733 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1734 }
1735
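/* Promote a varying's format for transform feedback capture: float formats
 * are captured as 32-bit floats, other base types keep their type but are
 * widened to 32 bits per channel, and the channel count is taken from the
 * stream output info */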
1736 static enum mali_format
1737 pan_xfb_format(enum mali_format format, unsigned nr)
1738 {
1739 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1740 return MALI_R32F | MALI_NR_CHANNELS(nr);
1741 else
1742 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1743 }
1744
1745 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1746 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1747 * value. */
1748
1749 static void
1750 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1751 unsigned present,
1752 unsigned max_xfb,
1753 unsigned *streamout_offsets,
1754 unsigned quirks,
1755 enum mali_format format,
1756 struct pipe_stream_output o)
1757 {
1758 unsigned swizzle = quirks & HAS_SWIZZLES ?
1759 panfrost_get_default_swizzle(o.num_components) :
1760 panfrost_bifrost_swizzle(o.num_components);
1761
1762 pan_pack(out, ATTRIBUTE, cfg) {
1763 /* XFB buffers come after everything else */
1764 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1765 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1766
1767 /* Override number of channels and precision to highp */
1768 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1769
1770 /* Add the destination offset within the vertex (in dwords) to the buffer's streamout offset */
1771 cfg.offset = (o.dst_offset * 4) /* dwords */
1772 + streamout_offsets[o.output_buffer];
1773 }
1774 }
1775
1776 /* Determine if we should capture a varying for XFB. This requires actually
1777 * having a buffer for it. If we don't capture it, we'll fall back to a general
1778 * varying path (linked or unlinked, possibly discarding the write) */
1779
1780 static bool
1781 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1782 unsigned loc, unsigned max_xfb)
1783 {
1784 if (!(xfb->so_mask & (1ull << loc)))
1785 return false;
1786
1787 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1788 return o->output_buffer < max_xfb;
1789 }
1790
1791 static void
1792 pan_emit_general_varying(struct mali_attribute_packed *out,
1793 struct panfrost_shader_state *other,
1794 struct panfrost_shader_state *xfb,
1795 gl_varying_slot loc,
1796 enum mali_format format,
1797 unsigned present,
1798 unsigned quirks,
1799 unsigned *gen_offsets,
1800 enum mali_format *gen_formats,
1801 unsigned *gen_stride,
1802 unsigned idx,
1803 bool should_alloc)
1804 {
1805 /* Check if we're linked */
1806 signed other_idx = -1;
1807
1808 for (unsigned j = 0; j < other->varying_count; ++j) {
1809 if (other->varyings_loc[j] == loc) {
1810 other_idx = j;
1811 break;
1812 }
1813 }
1814
1815 if (other_idx < 0) {
1816 pan_emit_vary_only(out, present, quirks);
1817 return;
1818 }
1819
1820 unsigned offset = gen_offsets[other_idx];
1821
1822 if (should_alloc) {
1823 /* We're linked, so allocate space via a watermark allocation */
1824 enum mali_format alt = other->varyings[other_idx];
1825
1826 /* Do interpolation at minimum precision */
1827 unsigned size_main = pan_varying_size(format);
1828 unsigned size_alt = pan_varying_size(alt);
1829 unsigned size = MIN2(size_main, size_alt);
1830
1831 /* If a varying is marked for XFB but not actually captured, we
1832 * should match the format to the format that would otherwise
1833 * be used for XFB, since dEQP checks for invariance here. It's
1834 * unclear if this is required by the spec. */
1835
1836 if (xfb->so_mask & (1ull << loc)) {
1837 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1838 format = pan_xfb_format(format, o->num_components);
1839 size = pan_varying_size(format);
1840 } else if (size == size_alt) {
1841 format = alt;
1842 }
1843
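/* Note the indexing convention: gen_offsets[] is indexed by the allocating
 * (vertex) stage's varying index, while gen_formats[] is indexed by the
 * consuming stage's index, matching the lookups made on the !should_alloc
 * (fragment) pass */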
1844 gen_offsets[idx] = *gen_stride;
1845 gen_formats[other_idx] = format;
1846 offset = *gen_stride;
1847 *gen_stride += size;
1848 }
1849
1850 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1851 }
1852
1853 /* Higher-level wrapper around all of the above, classifying a varying into one
1854 * of the above types */
1855
1856 static void
1857 panfrost_emit_varying(
1858 struct mali_attribute_packed *out,
1859 struct panfrost_shader_state *stage,
1860 struct panfrost_shader_state *other,
1861 struct panfrost_shader_state *xfb,
1862 unsigned present,
1863 unsigned max_xfb,
1864 unsigned *streamout_offsets,
1865 unsigned quirks,
1866 unsigned *gen_offsets,
1867 enum mali_format *gen_formats,
1868 unsigned *gen_stride,
1869 unsigned idx,
1870 bool should_alloc,
1871 bool is_fragment)
1872 {
1873 gl_varying_slot loc = stage->varyings_loc[idx];
1874 enum mali_format format = stage->varyings[idx];
1875
1876 /* Override format to match linkage */
1877 if (!should_alloc && gen_formats[idx])
1878 format = gen_formats[idx];
1879
1880 if (has_point_coord(stage->point_sprite_mask, loc)) {
1881 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1882 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1883 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1884 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1885 } else if (loc == VARYING_SLOT_POS) {
1886 if (is_fragment)
1887 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1888 else
1889 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1890 } else if (loc == VARYING_SLOT_PSIZ) {
1891 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1892 } else if (loc == VARYING_SLOT_PNTC) {
1893 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1894 } else if (loc == VARYING_SLOT_FACE) {
1895 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1896 } else {
1897 pan_emit_general_varying(out, other, xfb, loc, format, present,
1898 quirks, gen_offsets, gen_formats, gen_stride,
1899 idx, should_alloc);
1900 }
1901 }
1902
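/* If the given special varying is present, emit its attribute buffer record.
 * Rather than pointing at memory, the record is tagged with a hardware
 * "special" selector (point coord, front facing, fragment coord) */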
1903 static void
1904 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1905 unsigned present,
1906 enum pan_special_varying v,
1907 unsigned special)
1908 {
1909 if (present & (1 << v)) {
1910 unsigned idx = pan_varying_index(present, v);
1911
1912 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1913 cfg.special = special;
1914 cfg.type = 0;
1915 }
1916 }
1917 }
1918
1919 void
1920 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1921 unsigned vertex_count,
1922 struct mali_vertex_tiler_postfix *vertex_postfix,
1923 struct mali_vertex_tiler_postfix *tiler_postfix,
1924 union midgard_primitive_size *primitive_size)
1925 {
1926 /* Load the shaders */
1927 struct panfrost_context *ctx = batch->ctx;
1928 struct panfrost_device *dev = pan_device(ctx->base.screen);
1929 struct panfrost_shader_state *vs, *fs;
1930 size_t vs_size, fs_size;
1931
1932 /* Allocate the varying descriptor */
1933
1934 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1935 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1936 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1937 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1938
1939 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1940 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1941
1942 struct pipe_stream_output_info *so = &vs->stream_output;
1943 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1944
1945 /* Check if this varying is linked by us. This is the case for
1946 * general-purpose, non-captured varyings. If it is, link it. If it's
1947 * not, use the provided stream out information to determine the
1948 * offset, since it was already linked for us. */
1949
1950 unsigned gen_offsets[32];
1951 enum mali_format gen_formats[32];
1952 memset(gen_offsets, 0, sizeof(gen_offsets));
1953 memset(gen_formats, 0, sizeof(gen_formats));
1954
1955 unsigned gen_stride = 0;
1956 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1957 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1958
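/* Precompute the current write offset of each bound streamout target; these
 * are shared by the XFB varying records and the streamout buffer records
 * emitted below */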
1959 unsigned streamout_offsets[32];
1960
1961 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1962 streamout_offsets[i] = panfrost_streamout_offset(
1963 so->stride[i],
1964 ctx->streamout.offsets[i],
1965 ctx->streamout.targets[i]);
1966 }
1967
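/* Varying records are packed with the vertex shader's entries first, followed
 * immediately by the fragment shader's (mirrored by varying_meta below) */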
1968 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1969 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1970
1971 for (unsigned i = 0; i < vs->varying_count; i++) {
1972 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1973 ctx->streamout.num_targets, streamout_offsets,
1974 dev->quirks,
1975 gen_offsets, gen_formats, &gen_stride, i, true, false);
1976 }
1977
1978 for (unsigned i = 0; i < fs->varying_count; i++) {
1979 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1980 ctx->streamout.num_targets, streamout_offsets,
1981 dev->quirks,
1982 gen_offsets, gen_formats, &gen_stride, i, false, true);
1983 }
1984
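/* Allocate the attribute buffer records: the general/special buffers occupy
 * the first xfb_base records, with one record per streamout target after
 * them */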
1985 unsigned xfb_base = pan_xfb_base(present);
1986 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1987 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1988 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1989 struct mali_attribute_buffer_packed *varyings =
1990 (struct mali_attribute_buffer_packed *) T.cpu;
1991
1992 /* Emit the stream out buffers */
1993
1994 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1995 ctx->vertex_count);
1996
1997 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1998 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1999 so->stride[i],
2000 ctx->streamout.offsets[i],
2001 out_count,
2002 ctx->streamout.targets[i]);
2003 }
2004
2005 panfrost_emit_varyings(batch,
2006 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2007 gen_stride, vertex_count);
2008
2009 /* fp32 vec4 gl_Position */
2010 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2011 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2012 sizeof(float) * 4, vertex_count);
2013
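/* fp16 gl_PointSize, passed to the tiler as the per-vertex primitive size */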
2014 if (present & (1 << PAN_VARY_PSIZ)) {
2015 primitive_size->pointer = panfrost_emit_varyings(batch,
2016 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2017 2, vertex_count);
2018 }
2019
2020 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2021 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2022 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2023
2024 vertex_postfix->varyings = T.gpu;
2025 tiler_postfix->varyings = T.gpu;
2026
2027 vertex_postfix->varying_meta = trans.gpu;
2028 tiler_postfix->varying_meta = trans.gpu + vs_size;
2029 }
2030
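/* Assemble the final vertex/tiler payloads from the prefix/postfix structures
 * built above (choosing the Bifrost or Midgard layout) and enqueue the vertex
 * job, followed by the dependent tiler job unless rasterizer discard or the
 * wallpaper hack dictates otherwise */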
2031 void
2032 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2033 struct mali_vertex_tiler_prefix *vertex_prefix,
2034 struct mali_vertex_tiler_postfix *vertex_postfix,
2035 struct mali_vertex_tiler_prefix *tiler_prefix,
2036 struct mali_vertex_tiler_postfix *tiler_postfix,
2037 union midgard_primitive_size *primitive_size)
2038 {
2039 struct panfrost_context *ctx = batch->ctx;
2040 struct panfrost_device *device = pan_device(ctx->base.screen);
2041 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2042 struct bifrost_payload_vertex bifrost_vertex = {0,};
2043 struct bifrost_payload_tiler bifrost_tiler = {0,};
2044 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2045 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2046 void *vp, *tp;
2047 size_t vp_size, tp_size;
2048
2049 if (device->quirks & IS_BIFROST) {
2050 bifrost_vertex.prefix = *vertex_prefix;
2051 bifrost_vertex.postfix = *vertex_postfix;
2052 vp = &bifrost_vertex;
2053 vp_size = sizeof(bifrost_vertex);
2054
2055 bifrost_tiler.prefix = *tiler_prefix;
2056 bifrost_tiler.tiler.primitive_size = *primitive_size;
2057 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2058 bifrost_tiler.postfix = *tiler_postfix;
2059 tp = &bifrost_tiler;
2060 tp_size = sizeof(bifrost_tiler);
2061 } else {
2062 midgard_vertex.prefix = *vertex_prefix;
2063 midgard_vertex.postfix = *vertex_postfix;
2064 vp = &midgard_vertex;
2065 vp_size = sizeof(midgard_vertex);
2066
2067 midgard_tiler.prefix = *tiler_prefix;
2068 midgard_tiler.postfix = *tiler_postfix;
2069 midgard_tiler.primitive_size = *primitive_size;
2070 tp = &midgard_tiler;
2071 tp_size = sizeof(midgard_tiler);
2072 }
2073
2074 if (wallpapering) {
2075 /* Inject in reverse order, with "predicted" job indices.
2076 * THIS IS A HACK XXX */
2077 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2078 batch->scoreboard.job_index + 2, tp, tp_size, true);
2079 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2080 vp, vp_size, true);
2081 return;
2082 }
2083
2084 /* If rasterizer discard is enabled, only submit the vertex job */
2085
2086 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2087 vp, vp_size, false);
2088
2089 if (ctx->rasterizer->base.rasterizer_discard)
2090 return;
2091
2092 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2093 false);
2094 }
2095
2096 /* TODO: stop hardcoding this */
2097 mali_ptr
2098 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2099 {
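/* Hardcoded sample position table (see TODO above). The entries appear to be
 * (x, y) pairs, presumably in 1/256ths of a pixel so that (128, 128) is the
 * pixel centre */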
2100 uint16_t locations[] = {
2101 128, 128,
2102 0, 256,
2103 0, 256,
2104 0, 256,
2105 0, 256,
2106 0, 256,
2107 0, 256,
2108 0, 256,
2109 0, 256,
2110 0, 256,
2111 0, 256,
2112 0, 256,
2113 0, 256,
2114 0, 256,
2115 0, 256,
2116 0, 256,
2117 0, 256,
2118 0, 256,
2119 0, 256,
2120 0, 256,
2121 0, 256,
2122 0, 256,
2123 0, 256,
2124 0, 256,
2125 0, 256,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 0, 256,
2130 0, 256,
2131 0, 256,
2132 0, 256,
2133 128, 128,
2134 0, 0,
2135 0, 0,
2136 0, 0,
2137 0, 0,
2138 0, 0,
2139 0, 0,
2140 0, 0,
2141 0, 0,
2142 0, 0,
2143 0, 0,
2144 0, 0,
2145 0, 0,
2146 0, 0,
2147 0, 0,
2148 0, 0,
2149 };
2150
2151 return panfrost_pool_upload_aligned(&batch->pool, locations, sizeof(locations), 64);
2152 }