mesa.git: src/gallium/drivers/panfrost/pan_cmdstream.c (commit 2efd512ddb6a819ec06e84814221e34543239af2)
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), though it could last longer. Also gets
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
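/* Illustrative example (not upstream code): for an indexed draw whose index
 * buffer contains { 5, 2, 9 }, the helper above reports min_index = 2 and
 * max_index = 9. The caller then derives vertex_count = 9 - 2 + 1 = 8 and
 * offset_bias_correction = -2, so the hardware only touches the vertex range
 * actually referenced by the draw. */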
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
291
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
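/* Sketch of the instancing encoding used above (illustrative, not upstream
 * code): the padded count is an odd number times a power of two, and the
 * hardware receives it split into instance_shift/instance_odd such that
 * padded_count == (2 * instance_odd + 1) << instance_shift. For example,
 * assuming padded_count = 24 = 3 * 8: shift = ctz(24) = 3 and
 * odd = 24 >> (3 + 1) = 1, so (2 * 1 + 1) << 3 recovers 24. */

static inline unsigned
pan_example_decode_padded_count(unsigned instance_shift, unsigned instance_odd)
{
        return (2 * instance_odd + 1) << instance_shift;
}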
308
309 static void
310 panfrost_emit_compute_shader(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 meta->shader = ss->shader;
319 meta->attribute_count = ss->attribute_count;
320 meta->varying_count = ss->varying_count;
321 meta->texture_count = ctx->sampler_view_count[st];
322 meta->sampler_count = ctx->sampler_count[st];
323
324 if (dev->quirks & IS_BIFROST) {
325 meta->bifrost1.unk1 = 0x800000;
326 meta->bifrost2.preload_regs = 0xC0;
327 meta->bifrost2.uniform_count = ss->uniform_count;
328 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
329 } else {
330 meta->midgard1.uniform_count = ss->uniform_count;
331 meta->midgard1.work_count = ss->work_reg_count;
332
333 /* TODO: This is not conformant on ES3 */
334 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
335
336 meta->midgard1.flags_lo = 0x20;
337 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
338
339 SET_BIT(meta->midgard1.flags_lo, MALI_WRITES_GLOBAL, ss->writes_global);
340 }
341 }
342
343 static unsigned
344 translate_tex_wrap(enum pipe_tex_wrap w)
345 {
346 switch (w) {
347 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
348 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
349 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
350 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
351 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
352 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
353 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
354 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
355 default: unreachable("Invalid wrap");
356 }
357 }
358
359 /* The hardware compares in the wrong order, so we have to flip before
360 * encoding. Yes, really. */
361
362 static enum mali_func
363 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
364 {
365 if (!cso->compare_mode)
366 return MALI_FUNC_NEVER;
367
368 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
369 return panfrost_flip_compare_func(f);
370 }
371
372 static enum mali_mipmap_mode
373 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
374 {
375 switch (f) {
376 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
377 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
378 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
379 default: unreachable("Invalid");
380 }
381 }
382
383 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
384 struct mali_midgard_sampler_packed *hw)
385 {
386 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
387 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
388 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
389 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
390 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
391 cfg.normalized_coordinates = cso->normalized_coords;
392
393 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
394
395 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
396
397 /* If necessary, we disable mipmapping in the sampler descriptor by
398 * clamping the LOD as tight as possible (from 0 to epsilon,
399 * essentially -- remember these are fixed point numbers, so
400 * epsilon=1/256) */
401
402 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
403 cfg.minimum_lod + 1 :
404 FIXED_16(cso->max_lod, false);
405
406 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
407 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
408 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
409
410 cfg.compare_function = panfrost_sampler_compare_func(cso);
411 cfg.seamless_cube_map = cso->seamless_cube_map;
412
413 cfg.border_color_r = cso->border_color.f[0];
414 cfg.border_color_g = cso->border_color.f[1];
415 cfg.border_color_b = cso->border_color.f[2];
416 cfg.border_color_a = cso->border_color.f[3];
417 }
418 }
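/* Minimal sketch of the fixed-point LOD encoding relied on above (an
 * assumption for illustration, not the driver's FIXED_16 macro): the 1/256
 * epsilon mentioned in the comment implies an 8-bit fractional part, so
 * adding 1 to minimum_lod clamps the usable LOD range to [min, min + 1/256],
 * which effectively disables mipmapping. */

static inline uint16_t
pan_example_lod_to_fixed_8_8(float lod)
{
        /* Round to the nearest 1/256 step within an unsigned 8.8 range */
        float clamped = CLAMP(lod, 0.0f, 255.0f + 255.0f / 256.0f);
        return (uint16_t) (clamped * 256.0f + 0.5f);
}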
419
420 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
421 struct mali_bifrost_sampler_packed *hw)
422 {
423 pan_pack(hw, BIFROST_SAMPLER, cfg) {
424 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
425 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
426 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
427 cfg.normalized_coordinates = cso->normalized_coords;
428
429 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
430 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
431 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
432
433 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
434 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
435 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
436
437 cfg.compare_function = panfrost_sampler_compare_func(cso);
438 cfg.seamless_cube_map = cso->seamless_cube_map;
439 }
440 }
441
442 static bool
443 panfrost_fs_required(
444 struct panfrost_shader_state *fs,
445 struct panfrost_blend_final *blend,
446 unsigned rt_count)
447 {
448 /* If we generally have side effects */
449 if (fs->fs_sidefx)
450 return true;
451
452 /* If colour is written we need to execute */
453 for (unsigned i = 0; i < rt_count; ++i) {
454 if (!blend[i].no_colour)
455 return true;
456 }
457
458 /* If depth is written and not implied we need to execute.
459 * TODO: Predicate on Z/S writes being enabled */
460 return (fs->writes_depth || fs->writes_stencil);
461 }
462
463 static void
464 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
465 struct panfrost_blend_final *blend)
466 {
467 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
468 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
469 unsigned rt_count = batch->key.nr_cbufs;
470
471 struct bifrost_blend_rt *brts = rts;
472 struct midgard_blend_rt *mrts = rts;
473
474 /* Disable blending for depth-only on Bifrost */
475
476 if (rt_count == 0 && dev->quirks & IS_BIFROST)
477 brts[0].unk2 = 0x3;
478
479 for (unsigned i = 0; i < rt_count; ++i) {
480 unsigned flags = 0;
481
482 pan_pack(&flags, BLEND_FLAGS, cfg) {
483 if (blend[i].no_colour) {
484 cfg.enable = false;
485 break;
486 }
487
488 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
489
490 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
491 cfg.load_destination = blend[i].load_dest;
492 cfg.dither_disable = !batch->ctx->blend->base.dither;
493
494 if (!(dev->quirks & IS_BIFROST))
495 cfg.midgard_blend_shader = blend[i].is_shader;
496 }
497
498 if (dev->quirks & IS_BIFROST) {
499 brts[i].flags = flags;
500
501 if (blend[i].is_shader) {
502 /* The blend shader's address needs to share
503 * the same top 32 bits as the fragment shader.
504 * TODO: Ensure that's always the case.
505 */
506 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
507 (fs->bo->gpu & (0xffffffffull << 32)));
508 brts[i].shader = blend[i].shader.gpu;
509 brts[i].unk2 = 0x0;
510 } else {
511 enum pipe_format format = batch->key.cbufs[i]->format;
512 const struct util_format_description *format_desc;
513 format_desc = util_format_description(format);
514
515 brts[i].equation = blend[i].equation.equation;
516
517 /* TODO: this is a bit more complicated */
518 brts[i].constant = blend[i].equation.constant;
519
520 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
521
522 /* 0x19 disables blending and forces REPLACE
523 * mode (equivalent to rgb_mode = alpha_mode =
524 * 0x122, colour mask = 0xF). 0x1a allows
525 * blending. */
526 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
527
528 brts[i].shader_type = fs->blend_types[i];
529 }
530 } else {
531 memcpy(&mrts[i].flags, &flags, sizeof(flags));
532
533 if (blend[i].is_shader) {
534 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
535 } else {
536 mrts[i].blend.equation = blend[i].equation.equation;
537 mrts[i].blend.constant = blend[i].equation.constant;
538 }
539 }
540 }
541 }
542
543 static void
544 panfrost_emit_frag_shader(struct panfrost_context *ctx,
545 struct mali_shader_meta *fragmeta,
546 struct panfrost_blend_final *blend)
547 {
548 const struct panfrost_device *dev = pan_device(ctx->base.screen);
549 struct panfrost_shader_state *fs;
550
551 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
552
553 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
554 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
555
556 memset(fragmeta, 0, sizeof(*fragmeta));
557
558 fragmeta->shader = fs->shader;
559 fragmeta->attribute_count = fs->attribute_count;
560 fragmeta->varying_count = fs->varying_count;
561 fragmeta->texture_count = ctx->sampler_view_count[PIPE_SHADER_FRAGMENT];
562 fragmeta->sampler_count = ctx->sampler_count[PIPE_SHADER_FRAGMENT];
563
564 if (dev->quirks & IS_BIFROST) {
565 /* First clause ATEST |= 0x4000000.
566 * Less than 32 regs |= 0x200 */
567 fragmeta->bifrost1.unk1 = 0x950020;
568
569 fragmeta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
570 fragmeta->bifrost2.preload_regs = 0x1;
571 SET_BIT(fragmeta->bifrost2.preload_regs, 0x10, fs->reads_frag_coord);
572
573 fragmeta->bifrost2.uniform_count = fs->uniform_count;
574 } else {
575 fragmeta->midgard1.uniform_count = fs->uniform_count;
576 fragmeta->midgard1.work_count = fs->work_reg_count;
577
578 /* TODO: This is not conformant on ES3 */
579 fragmeta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
580
581 fragmeta->midgard1.flags_lo = 0x20;
582 fragmeta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, PIPE_SHADER_FRAGMENT);
583
584 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_GLOBAL, fs->writes_global);
585 }
586
587 bool msaa = rast->multisample;
588 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
589
590 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
591 fragmeta->unknown2_4 = 0x4e0;
592
593 if (dev->quirks & IS_BIFROST) {
594 /* TODO */
595 } else {
596 /* Depending on whether it's legal in the given shader, we try to
597 * enable early-z testing. TODO: respect e-z force */
598
599 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
600 !fs->can_discard && !fs->writes_global &&
601 !fs->writes_depth && !fs->writes_stencil &&
602 !ctx->blend->base.alpha_to_coverage);
603
604 /* Add the writes Z/S flags if needed. */
605 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
606 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
607
608 /* Any time texturing is used, derivatives are implicitly calculated,
609 * so we need to enable helper invocations */
610
611 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
612 fs->helper_invocations);
613
614 /* If discard is enabled, which bit we set to convey this
615 * depends on if depth/stencil is used for the draw or not.
616 * Just one of depth OR stencil is enough to trigger this. */
617
618 bool zs_enabled =
619 fs->writes_depth || fs->writes_stencil ||
620 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
621 zsa->base.stencil[0].enabled;
622
623 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
624 fs->outputs_read || (!zs_enabled && fs->can_discard));
625 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, zs_enabled && fs->can_discard);
626 }
627
628 /* TODO: Sample size */
629 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
630 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
631
632 /* EXT_shader_framebuffer_fetch requires the shader to be run
633 * per-sample when outputs are read. */
634 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
635 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
636
637 fragmeta->depth_units = rast->offset_units * 2.0f;
638 fragmeta->depth_factor = rast->offset_scale;
639
640 /* XXX: Which bit is which? Does this maybe allow offsetting non-tri primitives? */
641
642 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
643 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
644
645 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
646 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
647
648 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
649 zsa->base.stencil[0].enabled);
650
651 fragmeta->stencil_mask_front = zsa->stencil_mask_front;
652 fragmeta->stencil_mask_back = zsa->stencil_mask_back;
653
654 /* Bottom bits for stencil ref, exactly one word */
655 fragmeta->stencil_front.opaque[0] = zsa->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
656
657 /* If back-stencil is not enabled, use the front values */
658
659 if (zsa->base.stencil[1].enabled)
660 fragmeta->stencil_back.opaque[0] = zsa->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
661 else
662 fragmeta->stencil_back = fragmeta->stencil_front;
663
664 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
665 zsa->base.depth.writemask);
666
667 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
668 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
669 zsa->base.depth.enabled ? zsa->base.depth.func : PIPE_FUNC_ALWAYS));
670
671 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
672 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
673 !ctx->blend->base.dither);
674
675 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
676
677 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
678 ctx->blend->base.alpha_to_coverage);
679
680 /* Get blending setup */
681 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
682
683 /* Disable shader execution if we can */
684 if (dev->quirks & MIDGARD_SHADERLESS
685 && !panfrost_fs_required(fs, blend, rt_count)) {
686 fragmeta->shader = 0;
687 fragmeta->attribute_count = 0;
688 fragmeta->varying_count = 0;
689 fragmeta->texture_count = 0;
690 fragmeta->sampler_count = 0;
691
692 /* This feature is not known to work on Bifrost */
693 fragmeta->midgard1.work_count = 1;
694 fragmeta->midgard1.uniform_count = 0;
695 fragmeta->midgard1.uniform_buffer_count = 0;
696 }
697
698 /* If there is a blend shader, work registers are shared. We impose 8
699 * work registers as a limit for blend shaders. Should be lower XXX */
700
701 if (!(dev->quirks & IS_BIFROST)) {
702 for (unsigned c = 0; c < rt_count; ++c) {
703 if (blend[c].is_shader) {
704 fragmeta->midgard1.work_count =
705 MAX2(fragmeta->midgard1.work_count, 8);
706 }
707 }
708 }
709
710 if (dev->quirks & MIDGARD_SFBD) {
711 /* On platforms with only a single render target (SFBD), the blend
712 * information is inside the shader meta itself. We additionally
713 * need to signal CAN_DISCARD for nontrivial blend modes (so
714 * we're able to read back the destination buffer) */
715
716 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
717 blend[0].is_shader);
718
719 if (blend[0].is_shader) {
720 fragmeta->blend.shader = blend[0].shader.gpu |
721 blend[0].shader.first_tag;
722 } else {
723 fragmeta->blend.equation = blend[0].equation.equation;
724 fragmeta->blend.constant = blend[0].equation.constant;
725 }
726
727 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
728 blend[0].load_dest);
729 } else if (!(dev->quirks & IS_BIFROST)) {
730 /* Bug where MRT-capable hw apparently reads the last blend
731 * shader from here instead of the usual location? */
732
733 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
734 if (!blend[rt].is_shader)
735 continue;
736
737 fragmeta->blend.shader = blend[rt].shader.gpu |
738 blend[rt].shader.first_tag;
739 break;
740 }
741 }
742
743 if (dev->quirks & IS_BIFROST) {
744 bool no_blend = true;
745
746 for (unsigned i = 0; i < rt_count; ++i)
747 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
748
749 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
750 !fs->can_discard && !fs->writes_depth && no_blend);
751 }
752 }
753
754 void
755 panfrost_emit_shader_meta(struct panfrost_batch *batch,
756 enum pipe_shader_type st,
757 struct mali_vertex_tiler_postfix *postfix)
758 {
759 struct panfrost_context *ctx = batch->ctx;
760 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
761
762 if (!ss) {
763 postfix->shader = 0;
764 return;
765 }
766
767 struct mali_shader_meta meta;
768
769 /* Add the shader BO to the batch. */
770 panfrost_batch_add_bo(batch, ss->bo,
771 PAN_BO_ACCESS_PRIVATE |
772 PAN_BO_ACCESS_READ |
773 panfrost_bo_access_for_stage(st));
774
775 mali_ptr shader_ptr;
776
777 if (st == PIPE_SHADER_FRAGMENT) {
778 struct panfrost_device *dev = pan_device(ctx->base.screen);
779 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
780 size_t desc_size = sizeof(meta);
781 void *rts = NULL;
782 struct panfrost_transfer xfer;
783 unsigned rt_size;
784
785 if (dev->quirks & MIDGARD_SFBD)
786 rt_size = 0;
787 else if (dev->quirks & IS_BIFROST)
788 rt_size = sizeof(struct bifrost_blend_rt);
789 else
790 rt_size = sizeof(struct midgard_blend_rt);
791
792 desc_size += rt_size * rt_count;
793
794 if (rt_size)
795 rts = rzalloc_size(ctx, rt_size * rt_count);
796
797 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
798
799 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
800 blend[c] = panfrost_get_blend_for_context(ctx, c);
801
802 panfrost_emit_frag_shader(ctx, &meta, blend);
803
804 if (!(dev->quirks & MIDGARD_SFBD))
805 panfrost_emit_blend(batch, rts, blend);
806 else
807 batch->draws |= PIPE_CLEAR_COLOR0;
808
809 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
810
811 memcpy(xfer.cpu, &meta, sizeof(meta));
812 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
813
814 if (rt_size)
815 ralloc_free(rts);
816
817 shader_ptr = xfer.gpu;
818 } else {
819 panfrost_emit_compute_shader(ctx, st, &meta);
820
821 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
822 sizeof(meta));
823 }
824
825 postfix->shader = shader_ptr;
826 }
827
828 void
829 panfrost_emit_viewport(struct panfrost_batch *batch,
830 struct mali_vertex_tiler_postfix *tiler_postfix)
831 {
832 struct panfrost_context *ctx = batch->ctx;
833 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
834 const struct pipe_scissor_state *ss = &ctx->scissor;
835 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
836 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
837
838 /* Derive min/max from translate/scale. Note since |x| >= 0 by
839 * definition, we have that -|x| <= |x| hence translate - |scale| <=
840 * translate + |scale|, so the ordering is correct here. */
841 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
842 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
843 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
844 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
845 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
846 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
847
848 /* Scissor to the intersection of viewport and to the scissor, clamped
849 * to the framebuffer */
850
851 unsigned minx = MIN2(fb->width, vp_minx);
852 unsigned maxx = MIN2(fb->width, vp_maxx);
853 unsigned miny = MIN2(fb->height, vp_miny);
854 unsigned maxy = MIN2(fb->height, vp_maxy);
855
856 if (ss && rast->scissor) {
857 minx = MAX2(ss->minx, minx);
858 miny = MAX2(ss->miny, miny);
859 maxx = MIN2(ss->maxx, maxx);
860 maxy = MIN2(ss->maxy, maxy);
861 }
862
863 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
864
865 pan_pack(T.cpu, VIEWPORT, cfg) {
866 cfg.scissor_minimum_x = minx;
867 cfg.scissor_minimum_y = miny;
868 cfg.scissor_maximum_x = maxx - 1;
869 cfg.scissor_maximum_y = maxy - 1;
870
871 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
872 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
873 }
874
875 tiler_postfix->viewport = T.gpu;
876 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
877 }
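/* Worked example for the viewport math above (illustrative numbers): a
 * standard 800x600 viewport anchored at (0, 0) has scale = (400, 300, ...)
 * and translate = (400, 300, ...), so vp_minx = 400 - 400 = 0,
 * vp_maxx = 800, vp_miny = 0 and vp_maxy = 600, which is then intersected
 * with the scissor and clamped to the framebuffer size. */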
878
879 static mali_ptr
880 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
881 enum pipe_shader_type st,
882 struct panfrost_constant_buffer *buf,
883 unsigned index)
884 {
885 struct pipe_constant_buffer *cb = &buf->cb[index];
886 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
887
888 if (rsrc) {
889 panfrost_batch_add_bo(batch, rsrc->bo,
890 PAN_BO_ACCESS_SHARED |
891 PAN_BO_ACCESS_READ |
892 panfrost_bo_access_for_stage(st));
893
894 /* Alignment guaranteed by
895 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
896 return rsrc->bo->gpu + cb->buffer_offset;
897 } else if (cb->user_buffer) {
898 return panfrost_pool_upload_aligned(&batch->pool,
899 cb->user_buffer +
900 cb->buffer_offset,
901 cb->buffer_size, 16);
902 } else {
903 unreachable("No constant buffer");
904 }
905 }
906
907 struct sysval_uniform {
908 union {
909 float f[4];
910 int32_t i[4];
911 uint32_t u[4];
912 uint64_t du[2];
913 };
914 };
915
916 static void
917 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
918 struct sysval_uniform *uniform)
919 {
920 struct panfrost_context *ctx = batch->ctx;
921 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
922
923 uniform->f[0] = vp->scale[0];
924 uniform->f[1] = vp->scale[1];
925 uniform->f[2] = vp->scale[2];
926 }
927
928 static void
929 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
930 struct sysval_uniform *uniform)
931 {
932 struct panfrost_context *ctx = batch->ctx;
933 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
934
935 uniform->f[0] = vp->translate[0];
936 uniform->f[1] = vp->translate[1];
937 uniform->f[2] = vp->translate[2];
938 }
939
940 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
941 enum pipe_shader_type st,
942 unsigned int sysvalid,
943 struct sysval_uniform *uniform)
944 {
945 struct panfrost_context *ctx = batch->ctx;
946 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
947 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
948 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
949 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
950
951 assert(dim);
952 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
953
954 if (dim > 1)
955 uniform->i[1] = u_minify(tex->texture->height0,
956 tex->u.tex.first_level);
957
958 if (dim > 2)
959 uniform->i[2] = u_minify(tex->texture->depth0,
960 tex->u.tex.first_level);
961
962 if (is_array)
963 uniform->i[dim] = tex->texture->array_size;
964 }
965
966 static void
967 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
968 enum pipe_shader_type st,
969 unsigned ssbo_id,
970 struct sysval_uniform *uniform)
971 {
972 struct panfrost_context *ctx = batch->ctx;
973
974 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
975 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
976
977 /* Compute address */
978 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
979
980 panfrost_batch_add_bo(batch, bo,
981 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
982 panfrost_bo_access_for_stage(st));
983
984 /* Upload address and size as sysval */
985 uniform->du[0] = bo->gpu + sb.buffer_offset;
986 uniform->u[2] = sb.buffer_size;
987 }
988
989 static void
990 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
991 enum pipe_shader_type st,
992 unsigned samp_idx,
993 struct sysval_uniform *uniform)
994 {
995 struct panfrost_context *ctx = batch->ctx;
996 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
997
998 uniform->f[0] = sampl->min_lod;
999 uniform->f[1] = sampl->max_lod;
1000 uniform->f[2] = sampl->lod_bias;
1001
1002 /* Even without any errata, Midgard represents "no mipmapping" as
1003 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1004 * panfrost_create_sampler_state which also explains our choice of
1005 * epsilon value (again to keep behaviour consistent) */
1006
1007 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1008 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1009 }
1010
1011 static void
1012 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1013 struct sysval_uniform *uniform)
1014 {
1015 struct panfrost_context *ctx = batch->ctx;
1016
1017 uniform->u[0] = ctx->compute_grid->grid[0];
1018 uniform->u[1] = ctx->compute_grid->grid[1];
1019 uniform->u[2] = ctx->compute_grid->grid[2];
1020 }
1021
1022 static void
1023 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1024 struct panfrost_shader_state *ss,
1025 enum pipe_shader_type st)
1026 {
1027 struct sysval_uniform *uniforms = (void *)buf;
1028
1029 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1030 int sysval = ss->sysval[i];
1031
1032 switch (PAN_SYSVAL_TYPE(sysval)) {
1033 case PAN_SYSVAL_VIEWPORT_SCALE:
1034 panfrost_upload_viewport_scale_sysval(batch,
1035 &uniforms[i]);
1036 break;
1037 case PAN_SYSVAL_VIEWPORT_OFFSET:
1038 panfrost_upload_viewport_offset_sysval(batch,
1039 &uniforms[i]);
1040 break;
1041 case PAN_SYSVAL_TEXTURE_SIZE:
1042 panfrost_upload_txs_sysval(batch, st,
1043 PAN_SYSVAL_ID(sysval),
1044 &uniforms[i]);
1045 break;
1046 case PAN_SYSVAL_SSBO:
1047 panfrost_upload_ssbo_sysval(batch, st,
1048 PAN_SYSVAL_ID(sysval),
1049 &uniforms[i]);
1050 break;
1051 case PAN_SYSVAL_NUM_WORK_GROUPS:
1052 panfrost_upload_num_work_groups_sysval(batch,
1053 &uniforms[i]);
1054 break;
1055 case PAN_SYSVAL_SAMPLER:
1056 panfrost_upload_sampler_sysval(batch, st,
1057 PAN_SYSVAL_ID(sysval),
1058 &uniforms[i]);
1059 break;
1060 default:
1061 assert(0);
1062 }
1063 }
1064 }
1065
1066 static const void *
1067 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1068 unsigned index)
1069 {
1070 struct pipe_constant_buffer *cb = &buf->cb[index];
1071 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1072
1073 if (rsrc)
1074 return rsrc->bo->cpu;
1075 else if (cb->user_buffer)
1076 return cb->user_buffer;
1077 else
1078 unreachable("No constant buffer");
1079 }
1080
1081 void
1082 panfrost_emit_const_buf(struct panfrost_batch *batch,
1083 enum pipe_shader_type stage,
1084 struct mali_vertex_tiler_postfix *postfix)
1085 {
1086 struct panfrost_context *ctx = batch->ctx;
1087 struct panfrost_shader_variants *all = ctx->shader[stage];
1088
1089 if (!all)
1090 return;
1091
1092 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1093
1094 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1095
1096 /* Uniforms are implicitly UBO #0 */
1097 bool has_uniforms = buf->enabled_mask & (1 << 0);
1098
1099 /* Allocate room for the sysvals and the uniforms */
1100 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1101 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1102 size_t size = sys_size + uniform_size;
1103 struct panfrost_transfer transfer =
1104 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1105
1106 /* Upload sysvals requested by the shader */
1107 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1108
1109 /* Upload uniforms */
1110 if (has_uniforms && uniform_size) {
1111 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1112 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1113 }
1114
1115 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1116 * uploaded */
1117
1118 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1119 assert(ubo_count >= 1);
1120
1121 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1122 struct panfrost_transfer ubos =
1123 panfrost_pool_alloc_aligned(&batch->pool, sz,
1124 MALI_UNIFORM_BUFFER_LENGTH);
1125
1126 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1127
1128 /* Upload uniforms as a UBO */
1129
1130 if (size) {
1131 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1132 cfg.entries = DIV_ROUND_UP(size, 16);
1133 cfg.pointer = transfer.gpu;
1134 }
1135 } else {
1136 *ubo_ptr = 0;
1137 }
1138
1139 /* The rest are honest-to-goodness UBOs */
1140
1141 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1142 size_t usz = buf->cb[ubo].buffer_size;
1143 bool enabled = buf->enabled_mask & (1 << ubo);
1144 bool empty = usz == 0;
1145
1146 if (!enabled || empty) {
1147 ubo_ptr[ubo] = 0;
1148 continue;
1149 }
1150
1151 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1152 cfg.entries = DIV_ROUND_UP(usz, 16);
1153 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1154 stage, buf, ubo);
1155 }
1156 }
1157
1158 postfix->uniforms = transfer.gpu;
1159 postfix->uniform_buffers = ubos.gpu;
1160
1161 buf->dirty_mask = 0;
1162 }
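/* Illustrative layout of the UBO #0 built above (not upstream code): with,
 * say, three vec4 sysvals (48 bytes) followed by 100 bytes of user uniforms,
 * the combined allocation is 148 bytes and the descriptor advertises
 * DIV_ROUND_UP(148, 16) = 10 sixteen-byte entries; the user-visible uniforms
 * therefore start 48 bytes into UBO #0. */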
1163
1164 void
1165 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1166 const struct pipe_grid_info *info,
1167 struct midgard_payload_vertex_tiler *vtp)
1168 {
1169 struct panfrost_context *ctx = batch->ctx;
1170 struct panfrost_device *dev = pan_device(ctx->base.screen);
1171 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1172 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1173 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1174 128));
1175
1176 unsigned log2_instances =
1177 util_logbase2_ceil(info->grid[0]) +
1178 util_logbase2_ceil(info->grid[1]) +
1179 util_logbase2_ceil(info->grid[2]);
1180
1181 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1182 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1183 shared_size,
1184 1);
1185
1186 struct mali_shared_memory shared = {
1187 .shared_memory = bo->gpu,
1188 .shared_workgroup_count = log2_instances,
1189 .shared_shift = util_logbase2(single_size) + 1
1190 };
1191
1192 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1193 sizeof(shared), 64);
1194 }
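/* Worked example for the shared memory sizing above (illustrative numbers):
 * for a grid of (3, 4, 1) workgroups, log2_instances = 2 + 2 + 0 = 4, i.e.
 * room for 16 concurrent workgroups per core. If the shader needs 200 bytes
 * of shared storage, single_size rounds up to 256, so a hypothetical 4-core
 * GPU allocates 256 * 16 * 4 = 16384 bytes, and shared_shift becomes
 * log2(256) + 1 = 9. */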
1195
1196 static mali_ptr
1197 panfrost_get_tex_desc(struct panfrost_batch *batch,
1198 enum pipe_shader_type st,
1199 struct panfrost_sampler_view *view)
1200 {
1201 if (!view)
1202 return (mali_ptr) 0;
1203
1204 struct pipe_sampler_view *pview = &view->base;
1205 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1206
1207 /* Add the BO to the job so it's retained until the job is done. */
1208
1209 panfrost_batch_add_bo(batch, rsrc->bo,
1210 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1211 panfrost_bo_access_for_stage(st));
1212
1213 panfrost_batch_add_bo(batch, view->bo,
1214 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1215 panfrost_bo_access_for_stage(st));
1216
1217 return view->bo->gpu;
1218 }
1219
1220 static void
1221 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1222 struct pipe_context *pctx)
1223 {
1224 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1225 if (view->texture_bo != rsrc->bo->gpu ||
1226 view->modifier != rsrc->modifier) {
1227 panfrost_bo_unreference(view->bo);
1228 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1229 }
1230 }
1231
1232 void
1233 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1234 enum pipe_shader_type stage,
1235 struct mali_vertex_tiler_postfix *postfix)
1236 {
1237 struct panfrost_context *ctx = batch->ctx;
1238 struct panfrost_device *device = pan_device(ctx->base.screen);
1239
1240 if (!ctx->sampler_view_count[stage])
1241 return;
1242
1243 if (device->quirks & IS_BIFROST) {
1244 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1245 MALI_BIFROST_TEXTURE_LENGTH *
1246 ctx->sampler_view_count[stage],
1247 MALI_BIFROST_TEXTURE_LENGTH);
1248
1249 struct mali_bifrost_texture_packed *out =
1250 (struct mali_bifrost_texture_packed *) T.cpu;
1251
1252 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1253 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1254 struct pipe_sampler_view *pview = &view->base;
1255 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1256
1257 panfrost_update_sampler_view(view, &ctx->base);
1258 out[i] = view->bifrost_descriptor;
1259
1260 /* Add the BOs to the job so they are retained until the job is done. */
1261
1262 panfrost_batch_add_bo(batch, rsrc->bo,
1263 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1264 panfrost_bo_access_for_stage(stage));
1265
1266 panfrost_batch_add_bo(batch, view->bo,
1267 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1268 panfrost_bo_access_for_stage(stage));
1269 }
1270
1271 postfix->textures = T.gpu;
1272 } else {
1273 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1274
1275 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1276 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1277
1278 panfrost_update_sampler_view(view, &ctx->base);
1279
1280 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1281 }
1282
1283 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1284 trampolines,
1285 sizeof(uint64_t) *
1286 ctx->sampler_view_count[stage],
1287 sizeof(uint64_t));
1288 }
1289 }
1290
1291 void
1292 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1293 enum pipe_shader_type stage,
1294 struct mali_vertex_tiler_postfix *postfix)
1295 {
1296 struct panfrost_context *ctx = batch->ctx;
1297
1298 if (!ctx->sampler_count[stage])
1299 return;
1300
1301 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1302 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1303
1304 size_t sz = desc_size * ctx->sampler_count[stage];
1305 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1306 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1307
1308 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1309 out[i] = ctx->samplers[stage][i]->hw;
1310
1311 postfix->sampler_descriptor = T.gpu;
1312 }
1313
1314 void
1315 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1316 struct mali_vertex_tiler_postfix *vertex_postfix)
1317 {
1318 struct panfrost_context *ctx = batch->ctx;
1319 struct panfrost_vertex_state *so = ctx->vertex;
1320 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1321
1322 unsigned instance_shift = vertex_postfix->instance_shift;
1323 unsigned instance_odd = vertex_postfix->instance_odd;
1324
1325 /* Worst case: everything is NPOT, which is only possible if instancing
1326 * is enabled. Otherwise a single record is guaranteed */
1327 bool could_npot = instance_shift || instance_odd;
1328
1329 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1330 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1331 (could_npot ? 2 : 1),
1332 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1333
1334 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1335 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1336 MALI_ATTRIBUTE_LENGTH);
1337
1338 struct mali_attribute_buffer_packed *bufs =
1339 (struct mali_attribute_buffer_packed *) S.cpu;
1340
1341 struct mali_attribute_packed *out =
1342 (struct mali_attribute_packed *) T.cpu;
1343
1344 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1345 unsigned k = 0;
1346
1347 for (unsigned i = 0; i < so->num_elements; ++i) {
1348 /* We map buffers 1:1 with the attributes, which
1349 * means duplicating some vertex buffers (who cares? aside from
1350 * maybe some caching implications but I somehow doubt that
1351 * matters) */
1352
1353 struct pipe_vertex_element *elem = &so->pipe[i];
1354 unsigned vbi = elem->vertex_buffer_index;
1355 attrib_to_buffer[i] = k;
1356
1357 if (!(ctx->vb_mask & (1 << vbi)))
1358 continue;
1359
1360 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1361 struct panfrost_resource *rsrc;
1362
1363 rsrc = pan_resource(buf->buffer.resource);
1364 if (!rsrc)
1365 continue;
1366
1367 /* Add a dependency of the batch on the vertex buffer */
1368 panfrost_batch_add_bo(batch, rsrc->bo,
1369 PAN_BO_ACCESS_SHARED |
1370 PAN_BO_ACCESS_READ |
1371 PAN_BO_ACCESS_VERTEX_TILER);
1372
1373 /* Mask off lower bits, see offset fixup below */
1374 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1375 mali_ptr addr = raw_addr & ~63;
1376
1377 /* Since we advanced the base pointer, we shrink the buffer
1378 * size, but add the offset we subtracted */
1379 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1380 - buf->buffer_offset;
1381
1382 /* When there is a divisor, the hardware-level divisor is
1383 * the product of the instance divisor and the padded count */
1384 unsigned divisor = elem->instance_divisor;
1385 unsigned hw_divisor = ctx->padded_count * divisor;
1386 unsigned stride = buf->stride;
1387
1388 /* If there's a divisor(=1) but no instancing, we want every
1389 * attribute to be the same */
1390
1391 if (divisor && ctx->instance_count == 1)
1392 stride = 0;
1393
1394 if (!divisor || ctx->instance_count <= 1) {
1395 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1396 if (ctx->instance_count > 1)
1397 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1398
1399 cfg.pointer = addr;
1400 cfg.stride = stride;
1401 cfg.size = size;
1402 cfg.divisor_r = instance_shift;
1403 cfg.divisor_p = instance_odd;
1404 }
1405 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1406 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1407 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1408 cfg.pointer = addr;
1409 cfg.stride = stride;
1410 cfg.size = size;
1411 cfg.divisor_r = __builtin_ctz(hw_divisor);
1412 }
1413
1414 } else {
1415 unsigned shift = 0, extra_flags = 0;
1416
1417 unsigned magic_divisor =
1418 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1419
1420 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1421 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1422 cfg.pointer = addr;
1423 cfg.stride = stride;
1424 cfg.size = size;
1425
1426 cfg.divisor_r = shift;
1427 cfg.divisor_e = extra_flags;
1428 }
1429
1430 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1431 cfg.divisor_numerator = magic_divisor;
1432 cfg.divisor = divisor;
1433 }
1434
1435 ++k;
1436 }
1437
1438 ++k;
1439 }
1440
1441 /* Add special gl_VertexID/gl_InstanceID buffers */
1442
1443 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1444 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1445
1446 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1447 cfg.buffer_index = k++;
1448 cfg.format = so->formats[PAN_VERTEX_ID];
1449 }
1450
1451 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1452
1453 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1454 cfg.buffer_index = k++;
1455 cfg.format = so->formats[PAN_INSTANCE_ID];
1456 }
1457 }
1458
1459 /* Attribute addresses require 64-byte alignment, so let:
1460 *
1461 * base' = base & ~63 = base - (base & 63)
1462 * offset' = offset + (base & 63)
1463 *
1464 * Since base' + offset' = base + offset, these are equivalent
1465 * addressing modes and now base is 64 aligned.
1466 */
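/* For instance (illustrative numbers): with base = 0x10070 and offset = 8,
 * base & 63 = 0x30, so base' = 0x10040 and offset' = 0x38; the sum is still
 * 0x10078, but the buffer pointer is now 64-byte aligned. */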
1467
1468 unsigned start = vertex_postfix->offset_start;
1469
1470 for (unsigned i = 0; i < so->num_elements; ++i) {
1471 unsigned vbi = so->pipe[i].vertex_buffer_index;
1472 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1473
1474 /* Adjust by the masked off bits of the offset. Make sure we
1475 * read src_offset from so->hw (which is not GPU visible)
1476 * rather than target (which is) due to caching effects */
1477
1478 unsigned src_offset = so->pipe[i].src_offset;
1479
1480 /* BOs aligned to 4k so guaranteed aligned to 64 */
1481 src_offset += (buf->buffer_offset & 63);
1482
1483 /* Also, somewhat obscurely, per-instance data needs to be
1484 * offset in response to a delayed start in an indexed draw */
1485
1486 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1487 src_offset -= buf->stride * start;
1488
1489 pan_pack(out + i, ATTRIBUTE, cfg) {
1490 cfg.buffer_index = attrib_to_buffer[i];
1491 cfg.format = so->formats[i];
1492 cfg.offset = src_offset;
1493 }
1494 }
1495
1496 vertex_postfix->attributes = S.gpu;
1497 vertex_postfix->attribute_meta = T.gpu;
1498 }
1499
1500 static mali_ptr
1501 panfrost_emit_varyings(struct panfrost_batch *batch,
1502 struct mali_attribute_buffer_packed *slot,
1503 unsigned stride, unsigned count)
1504 {
1505 unsigned size = stride * count;
1506 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1507
1508 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1509 cfg.stride = stride;
1510 cfg.size = size;
1511 cfg.pointer = ptr;
1512 }
1513
1514 return ptr;
1515 }
1516
1517 static unsigned
1518 panfrost_streamout_offset(unsigned stride, unsigned offset,
1519 struct pipe_stream_output_target *target)
1520 {
1521 return (target->buffer_offset + (offset * stride * 4)) & 63;
1522 }
1523
1524 static void
1525 panfrost_emit_streamout(struct panfrost_batch *batch,
1526 struct mali_attribute_buffer_packed *slot,
1527 unsigned stride_words, unsigned offset, unsigned count,
1528 struct pipe_stream_output_target *target)
1529 {
1530 unsigned stride = stride_words * 4;
1531 unsigned max_size = target->buffer_size;
1532 unsigned expected_size = stride * count;
1533
1534 /* Grab the BO and bind it to the batch */
1535 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1536
1537 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1538 * the perspective of the TILER and FRAGMENT.
1539 */
1540 panfrost_batch_add_bo(batch, bo,
1541 PAN_BO_ACCESS_SHARED |
1542 PAN_BO_ACCESS_RW |
1543 PAN_BO_ACCESS_VERTEX_TILER |
1544 PAN_BO_ACCESS_FRAGMENT);
1545
1546 /* We will have an offset applied to get alignment */
1547 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1548
1549 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1550 cfg.pointer = (addr & ~63);
1551 cfg.stride = stride;
1552 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1553 }
1554 }
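/* Illustrative example tying the two streamout helpers together (not
 * upstream code): with buffer_offset = 100, stride_words = 4 (stride = 16
 * bytes) and 3 vertices already emitted, the write address is
 * bo->gpu + 100 + 48; since BOs are page aligned, its low bits are
 * (100 + 48) & 63 = 20. The record above therefore points at the
 * 64-byte-aligned address 20 bytes earlier and grows the size by 20, while
 * panfrost_streamout_offset() returns the same 20 so the varying records can
 * add it back as their offset. */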
1555
1556 static bool
1557 has_point_coord(unsigned mask, gl_varying_slot loc)
1558 {
1559 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1560 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1561 else if (loc == VARYING_SLOT_PNTC)
1562 return (mask & (1 << 8));
1563 else
1564 return false;
1565 }
1566
1567 /* Helpers for manipulating stream out information so we can pack varyings
1568 * accordingly. Compute the src_offset for a given captured varying */
1569
1570 static struct pipe_stream_output *
1571 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1572 {
1573 for (unsigned i = 0; i < info->num_outputs; ++i) {
1574 if (info->output[i].register_index == loc)
1575 return &info->output[i];
1576 }
1577
1578 unreachable("Varying not captured");
1579 }
1580
1581 static unsigned
1582 pan_varying_size(enum mali_format fmt)
1583 {
1584 unsigned type = MALI_EXTRACT_TYPE(fmt);
1585 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1586 unsigned bits = MALI_EXTRACT_BITS(fmt);
1587 unsigned bpc = 0;
1588
1589 if (bits == MALI_CHANNEL_FLOAT) {
1590 /* No doubles */
1591 bool fp16 = (type == MALI_FORMAT_SINT);
1592 assert(fp16 || (type == MALI_FORMAT_UNORM));
1593
1594 bpc = fp16 ? 2 : 4;
1595 } else {
1596 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1597
1598 /* See the enums */
1599 bits = 1 << bits;
1600 assert(bits >= 8);
1601 bpc = bits / 8;
1602 }
1603
1604 return bpc * chan;
1605 }
1606
1607 /* Indices for named (non-XFB) varyings that are present. These are packed
1608 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1609 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1610 * of a given special field given a shift S by:
1611 *
1612 * idx = popcount(P & ((1 << S) - 1))
1613 *
1614 * That is, look at all of the varyings that come earlier and count them; that
1615 * count is the index of the new one. Likewise, the total number of special
1616 * buffers required is simply popcount(P)
1617 */
1618
1619 enum pan_special_varying {
1620 PAN_VARY_GENERAL = 0,
1621 PAN_VARY_POSITION = 1,
1622 PAN_VARY_PSIZ = 2,
1623 PAN_VARY_PNTCOORD = 3,
1624 PAN_VARY_FACE = 4,
1625 PAN_VARY_FRAGCOORD = 5,
1626
1627 /* Keep last */
1628 PAN_VARY_MAX,
1629 };
1630
1631 /* Given a varying, figure out which index it corresponds to */
1632
1633 static inline unsigned
1634 pan_varying_index(unsigned present, enum pan_special_varying v)
1635 {
1636 unsigned mask = (1 << v) - 1;
1637 return util_bitcount(present & mask);
1638 }
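/* Worked example (illustrative): if only the general, position and point
 * coordinate buffers are present, present = 0b1011. Looking up
 * PAN_VARY_PNTCOORD (bit 3) masks with (1 << 3) - 1 = 0b0111 and counts
 * popcount(0b0011) = 2, so the point coordinate buffer is record #2, right
 * after the general and position buffers. */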
1639
1640 /* Get the base offset for XFB buffers, which by convention come after
1641 * everything else. Wrapper function for semantic reasons; by construction this
1642 * is just popcount. */
1643
1644 static inline unsigned
1645 pan_xfb_base(unsigned present)
1646 {
1647 return util_bitcount(present);
1648 }
1649
1650 /* Computes the present mask for varyings so we can start emitting varying records */
1651
1652 static inline unsigned
1653 pan_varying_present(
1654 struct panfrost_shader_state *vs,
1655 struct panfrost_shader_state *fs,
1656 unsigned quirks)
1657 {
1658 /* At the moment we always emit general and position buffers. Not
1659 * strictly necessary but usually harmless */
1660
1661 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1662
1663 /* Enable special buffers by the shader info */
1664
1665 if (vs->writes_point_size)
1666 present |= (1 << PAN_VARY_PSIZ);
1667
1668 if (fs->reads_point_coord)
1669 present |= (1 << PAN_VARY_PNTCOORD);
1670
1671 if (fs->reads_face)
1672 present |= (1 << PAN_VARY_FACE);
1673
1674 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1675 present |= (1 << PAN_VARY_FRAGCOORD);
1676
1677 /* Also, if we have a point sprite, we need a point coord buffer */
1678
1679 for (unsigned i = 0; i < fs->varying_count; i++) {
1680 gl_varying_slot loc = fs->varyings_loc[i];
1681
1682 if (has_point_coord(fs->point_sprite_mask, loc))
1683 present |= (1 << PAN_VARY_PNTCOORD);
1684 }
1685
1686 return present;
1687 }
1688
1689 /* Emitters for varying records */
1690
1691 static void
1692 pan_emit_vary(struct mali_attribute_packed *out,
1693 unsigned present, enum pan_special_varying buf,
1694 unsigned quirks, enum mali_format format,
1695 unsigned offset)
1696 {
1697 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1698 unsigned swizzle = quirks & HAS_SWIZZLES ?
1699 panfrost_get_default_swizzle(nr_channels) :
1700 panfrost_bifrost_swizzle(nr_channels);
1701
1702 pan_pack(out, ATTRIBUTE, cfg) {
1703 cfg.buffer_index = pan_varying_index(present, buf);
1704 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1705 cfg.format = (format << 12) | swizzle;
1706 cfg.offset = offset;
1707 }
1708 }
1709
1710 /* General varying that is unused */
1711
1712 static void
1713 pan_emit_vary_only(struct mali_attribute_packed *out,
1714 unsigned present, unsigned quirks)
1715 {
1716 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1717 }
1718
1719 /* Special records */
1720
1721 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1722 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1723 [PAN_VARY_PSIZ] = MALI_R16F,
1724 [PAN_VARY_PNTCOORD] = MALI_R16F,
1725 [PAN_VARY_FACE] = MALI_R32I,
1726 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1727 };
1728
1729 static void
1730 pan_emit_vary_special(struct mali_attribute_packed *out,
1731 unsigned present, enum pan_special_varying buf,
1732 unsigned quirks)
1733 {
1734 assert(buf < PAN_VARY_MAX);
1735 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1736 }
1737
1738 static enum mali_format
1739 pan_xfb_format(enum mali_format format, unsigned nr)
1740 {
1741 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1742 return MALI_R32F | MALI_NR_CHANNELS(nr);
1743 else
1744 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1745 }
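/* Intent of the above (assuming MALI_EXTRACT_BITS / MALI_EXTRACT_TYPE behave
 * as their names suggest): a float varying captured with nr components is
 * promoted to an nr-channel 32-bit float format (e.g. an fp16 vec2 captured
 * with 2 components becomes a 2-channel fp32 format), while a non-float
 * varying keeps its base type but is widened to 32 bits per channel. */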
1746
1747 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1748 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1749 * value. */
1750
1751 static void
1752 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1753 unsigned present,
1754 unsigned max_xfb,
1755 unsigned *streamout_offsets,
1756 unsigned quirks,
1757 enum mali_format format,
1758 struct pipe_stream_output o)
1759 {
1760 unsigned swizzle = quirks & HAS_SWIZZLES ?
1761 panfrost_get_default_swizzle(o.num_components) :
1762 panfrost_bifrost_swizzle(o.num_components);
1763
1764 pan_pack(out, ATTRIBUTE, cfg) {
1765 /* XFB buffers come after everything else */
1766 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1767 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1768
1769 /* Override number of channels and precision to highp */
1770 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1771
1772                 /* Combine the capture's dword offset with the buffer's streamout offset */
1773 cfg.offset = (o.dst_offset * 4) /* dwords */
1774 + streamout_offsets[o.output_buffer];
1775 }
1776 }
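/* Worked offset example (values made up for illustration): a capture declared
 * at dst_offset 3 into XFB buffer 1, with that buffer's running streamout
 * offset at 64 bytes, lands at byte offset 3 * 4 + 64 = 76 within attribute
 * buffer (pan_xfb_base(present) + 1). */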
1777
1778 /* Determine if we should capture a varying for XFB. This requires actually
1779  * having a buffer for it. If we don't capture it, we fall back to a general
1780  * varying path (linked or unlinked, possibly discarding the write) */
1781
1782 static bool
1783 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1784 unsigned loc, unsigned max_xfb)
1785 {
1786 if (!(xfb->so_mask & (1ll << loc)))
1787 return false;
1788
1789 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1790 return o->output_buffer < max_xfb;
1791 }
1792
1793 static void
1794 pan_emit_general_varying(struct mali_attribute_packed *out,
1795 struct panfrost_shader_state *other,
1796 struct panfrost_shader_state *xfb,
1797 gl_varying_slot loc,
1798 enum mali_format format,
1799 unsigned present,
1800 unsigned quirks,
1801 unsigned *gen_offsets,
1802 enum mali_format *gen_formats,
1803 unsigned *gen_stride,
1804 unsigned idx,
1805 bool should_alloc)
1806 {
1807 /* Check if we're linked */
1808 signed other_idx = -1;
1809
1810 for (unsigned j = 0; j < other->varying_count; ++j) {
1811 if (other->varyings_loc[j] == loc) {
1812 other_idx = j;
1813 break;
1814 }
1815 }
1816
1817 if (other_idx < 0) {
1818 pan_emit_vary_only(out, present, quirks);
1819 return;
1820 }
1821
1822 unsigned offset = gen_offsets[other_idx];
1823
1824 if (should_alloc) {
1825                 /* We're linked, so allocate space via a watermark allocation */
1826 enum mali_format alt = other->varyings[other_idx];
1827
1828 /* Do interpolation at minimum precision */
1829 unsigned size_main = pan_varying_size(format);
1830 unsigned size_alt = pan_varying_size(alt);
1831 unsigned size = MIN2(size_main, size_alt);
1832
1833 /* If a varying is marked for XFB but not actually captured, we
1834 * should match the format to the format that would otherwise
1835 * be used for XFB, since dEQP checks for invariance here. It's
1836 * unclear if this is required by the spec. */
1837
1838 if (xfb->so_mask & (1ull << loc)) {
1839 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1840 format = pan_xfb_format(format, o->num_components);
1841 size = pan_varying_size(format);
1842 } else if (size == size_alt) {
1843 format = alt;
1844 }
1845
1846 gen_offsets[idx] = *gen_stride;
1847 gen_formats[other_idx] = format;
1848 offset = *gen_stride;
1849 *gen_stride += size;
1850 }
1851
1852 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1853 }
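/* Worked example of the watermark allocation above (sizes assumed to follow
 * pan_varying_size): if this stage declares a slot as an fp32 vec4 (16 bytes)
 * but the linked stage declares it as an fp16 vec4 (8 bytes), interpolation
 * happens at the smaller size, so the slot takes 8 bytes at the current
 * gen_stride and gen_stride advances by 8. If the slot is also marked for
 * XFB, the XFB-widened format (and its size) is used instead so captured and
 * non-captured runs stay invariant. */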
1854
1855 /* Higher-level wrapper around all of the above, classifying a varying into one
1856 * of the above types */
1857
1858 static void
1859 panfrost_emit_varying(
1860 struct mali_attribute_packed *out,
1861 struct panfrost_shader_state *stage,
1862 struct panfrost_shader_state *other,
1863 struct panfrost_shader_state *xfb,
1864 unsigned present,
1865 unsigned max_xfb,
1866 unsigned *streamout_offsets,
1867 unsigned quirks,
1868 unsigned *gen_offsets,
1869 enum mali_format *gen_formats,
1870 unsigned *gen_stride,
1871 unsigned idx,
1872 bool should_alloc,
1873 bool is_fragment)
1874 {
1875 gl_varying_slot loc = stage->varyings_loc[idx];
1876 enum mali_format format = stage->varyings[idx];
1877
1878 /* Override format to match linkage */
1879 if (!should_alloc && gen_formats[idx])
1880 format = gen_formats[idx];
1881
1882 if (has_point_coord(stage->point_sprite_mask, loc)) {
1883 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1884 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1885 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1886 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1887 } else if (loc == VARYING_SLOT_POS) {
1888 if (is_fragment)
1889 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1890 else
1891 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1892 } else if (loc == VARYING_SLOT_PSIZ) {
1893 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1894 } else if (loc == VARYING_SLOT_PNTC) {
1895 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1896 } else if (loc == VARYING_SLOT_FACE) {
1897 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1898 } else {
1899 pan_emit_general_varying(out, other, xfb, loc, format, present,
1900 quirks, gen_offsets, gen_formats, gen_stride,
1901 idx, should_alloc);
1902 }
1903 }
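/* In other words, the classification above resolves, in priority order:
 * point-sprite-replaced slots, XFB-captured slots, gl_Position (the position
 * buffer when emitted for the vertex stage, the fragment coordinate buffer
 * for the fragment stage), point size, point coord, front face, and finally
 * the general linked/unlinked path. */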
1904
1905 static void
1906 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1907 unsigned present,
1908 enum pan_special_varying v,
1909 unsigned special)
1910 {
1911 if (present & (1 << v)) {
1912 unsigned idx = pan_varying_index(present, v);
1913
1914 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1915 cfg.special = special;
1916 cfg.type = 0;
1917 }
1918 }
1919 }
1920
1921 void
1922 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1923 unsigned vertex_count,
1924 struct mali_vertex_tiler_postfix *vertex_postfix,
1925 struct mali_vertex_tiler_postfix *tiler_postfix,
1926 union midgard_primitive_size *primitive_size)
1927 {
1928 /* Load the shaders */
1929 struct panfrost_context *ctx = batch->ctx;
1930 struct panfrost_device *dev = pan_device(ctx->base.screen);
1931 struct panfrost_shader_state *vs, *fs;
1932 size_t vs_size, fs_size;
1933
1934 /* Allocate the varying descriptor */
1935
1936 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1937 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1938 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1939 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1940
1941 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1942 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1943
1944 struct pipe_stream_output_info *so = &vs->stream_output;
1945 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1946
1947         /* Check whether each varying is linked by us; that is the case for
1948          * general-purpose, non-captured varyings, and if so we link it here.
1949          * Otherwise, use the provided stream-out information to determine the
1950          * offset, since the linking was already done for us. */
1951
1952 unsigned gen_offsets[32];
1953 enum mali_format gen_formats[32];
1954 memset(gen_offsets, 0, sizeof(gen_offsets));
1955 memset(gen_formats, 0, sizeof(gen_formats));
1956
1957 unsigned gen_stride = 0;
1958 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1959 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1960
1961 unsigned streamout_offsets[32];
1962
1963 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1964 streamout_offsets[i] = panfrost_streamout_offset(
1965 so->stride[i],
1966 ctx->streamout.offsets[i],
1967 ctx->streamout.targets[i]);
1968 }
1969
1970 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1971 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1972
1973 for (unsigned i = 0; i < vs->varying_count; i++) {
1974 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1975 ctx->streamout.num_targets, streamout_offsets,
1976 dev->quirks,
1977 gen_offsets, gen_formats, &gen_stride, i, true, false);
1978 }
1979
1980 for (unsigned i = 0; i < fs->varying_count; i++) {
1981 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1982 ctx->streamout.num_targets, streamout_offsets,
1983 dev->quirks,
1984 gen_offsets, gen_formats, &gen_stride, i, false, true);
1985 }
1986
1987 unsigned xfb_base = pan_xfb_base(present);
1988 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1989 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1990 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1991 struct mali_attribute_buffer_packed *varyings =
1992 (struct mali_attribute_buffer_packed *) T.cpu;
1993
1994 /* Emit the stream out buffers */
1995
1996 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1997 ctx->vertex_count);
1998
1999 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2000 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2001 so->stride[i],
2002 ctx->streamout.offsets[i],
2003 out_count,
2004 ctx->streamout.targets[i]);
2005 }
2006
2007 panfrost_emit_varyings(batch,
2008 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2009 gen_stride, vertex_count);
2010
2011 /* fp32 vec4 gl_Position */
2012 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2013 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2014 sizeof(float) * 4, vertex_count);
2015
2016 if (present & (1 << PAN_VARY_PSIZ)) {
2017 primitive_size->pointer = panfrost_emit_varyings(batch,
2018 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2019 2, vertex_count);
2020 }
2021
2022 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2023 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2024 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2025
2026 vertex_postfix->varyings = T.gpu;
2027 tiler_postfix->varyings = T.gpu;
2028
2029 vertex_postfix->varying_meta = trans.gpu;
2030 tiler_postfix->varying_meta = trans.gpu + vs_size;
2031 }
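/* To make the resulting layout concrete (a sketch for one plausible
 * configuration): with general, position and point-size buffers present and
 * two streamout targets bound, the attribute buffer array uploaded in T is
 *
 *      [0] general varyings  (gen_stride bytes per vertex)
 *      [1] gl_Position       (fp32 vec4 per vertex)
 *      [2] gl_PointSize      (fp16 per vertex)
 *      [3] XFB target 0
 *      [4] XFB target 1
 *
 * and the per-varying ATTRIBUTE records in trans index into it. */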
2032
2033 void
2034 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2035 struct mali_vertex_tiler_prefix *vertex_prefix,
2036 struct mali_vertex_tiler_postfix *vertex_postfix,
2037 struct mali_vertex_tiler_prefix *tiler_prefix,
2038 struct mali_vertex_tiler_postfix *tiler_postfix,
2039 union midgard_primitive_size *primitive_size)
2040 {
2041 struct panfrost_context *ctx = batch->ctx;
2042 struct panfrost_device *device = pan_device(ctx->base.screen);
2043 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2044 struct bifrost_payload_vertex bifrost_vertex = {0,};
2045 struct bifrost_payload_tiler bifrost_tiler = {0,};
2046 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2047 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2048 void *vp, *tp;
2049 size_t vp_size, tp_size;
2050
2051 if (device->quirks & IS_BIFROST) {
2052 bifrost_vertex.prefix = *vertex_prefix;
2053 bifrost_vertex.postfix = *vertex_postfix;
2054 vp = &bifrost_vertex;
2055 vp_size = sizeof(bifrost_vertex);
2056
2057 bifrost_tiler.prefix = *tiler_prefix;
2058 bifrost_tiler.tiler.primitive_size = *primitive_size;
2059 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2060 bifrost_tiler.postfix = *tiler_postfix;
2061 tp = &bifrost_tiler;
2062 tp_size = sizeof(bifrost_tiler);
2063 } else {
2064 midgard_vertex.prefix = *vertex_prefix;
2065 midgard_vertex.postfix = *vertex_postfix;
2066 vp = &midgard_vertex;
2067 vp_size = sizeof(midgard_vertex);
2068
2069 midgard_tiler.prefix = *tiler_prefix;
2070 midgard_tiler.postfix = *tiler_postfix;
2071 midgard_tiler.primitive_size = *primitive_size;
2072 tp = &midgard_tiler;
2073 tp_size = sizeof(midgard_tiler);
2074 }
2075
2076 if (wallpapering) {
2077 /* Inject in reverse order, with "predicted" job indices.
2078 * THIS IS A HACK XXX */
2079 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2080 batch->scoreboard.job_index + 2, tp, tp_size, true);
2081 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2082 vp, vp_size, true);
2083 return;
2084 }
2085
2086         /* If rasterizer discard is enabled, only submit the vertex job */
2087
2088 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2089 vp, vp_size, false);
2090
2091 if (ctx->rasterizer->base.rasterizer_discard)
2092 return;
2093
2094 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2095 false);
2096 }
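/* So the usual (non-wallpaper) flow is: submit the vertex job, take its
 * scoreboard index, and submit the tiler job with that index as a dependency,
 * ensuring shading for the draw completes before tiling begins. Under
 * rasterizer discard the tiler job is skipped entirely. */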
2097
2098 /* TODO: stop hardcoding this */
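/* The table below holds 48 (x, y) pairs (96 uint16_t, matching the upload
 * size). The interpretation is an assumption rather than something the code
 * confirms: the coordinates appear to be fixed point with 256 spanning one
 * pixel, making (128, 128) the pixel centre used for the single-sample
 * position. */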
2099 mali_ptr
2100 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2101 {
2102 uint16_t locations[] = {
2103 128, 128,
2104 0, 256,
2105 0, 256,
2106 0, 256,
2107 0, 256,
2108 0, 256,
2109 0, 256,
2110 0, 256,
2111 0, 256,
2112 0, 256,
2113 0, 256,
2114 0, 256,
2115 0, 256,
2116 0, 256,
2117 0, 256,
2118 0, 256,
2119 0, 256,
2120 0, 256,
2121 0, 256,
2122 0, 256,
2123 0, 256,
2124 0, 256,
2125 0, 256,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 0, 256,
2130 0, 256,
2131 0, 256,
2132 0, 256,
2133 0, 256,
2134 0, 256,
2135 128, 128,
2136 0, 0,
2137 0, 0,
2138 0, 0,
2139 0, 0,
2140 0, 0,
2141 0, 0,
2142 0, 0,
2143 0, 0,
2144 0, 0,
2145 0, 0,
2146 0, 0,
2147 0, 0,
2148 0, 0,
2149 0, 0,
2150 0, 0,
2151 };
2152
2153 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2154 }