panfrost: Prepack fragment properties/preload
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 struct mali_shared_memory shared = {
62 .shared_workgroup_count = ~0,
63 };
64
65 if (batch->stack_size) {
66 struct panfrost_bo *stack =
67 panfrost_batch_get_scratchpad(batch, batch->stack_size,
68 dev->thread_tls_alloc,
69 dev->core_count);
70
71 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
72 shared.scratchpad = stack->gpu;
73 }
74
75 postfix->shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
76 }
77
78 static void
79 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_postfix *postfix)
81 {
82 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
83 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
84 }
85
86 static void
87 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
88 struct mali_vertex_tiler_prefix *prefix,
89 struct mali_vertex_tiler_postfix *postfix)
90 {
91 postfix->gl_enables |= 0x7;
92 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
93 rasterizer->base.front_ccw);
94 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
95 (rasterizer->base.cull_face & PIPE_FACE_FRONT));
96 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
97 (rasterizer->base.cull_face & PIPE_FACE_BACK));
98 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
99 rasterizer->base.flatshade_first);
100 }
101
102 void
103 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
104 struct mali_vertex_tiler_prefix *prefix,
105 union midgard_primitive_size *primitive_size)
106 {
107 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
108
109 if (!panfrost_writes_point_size(ctx)) {
110 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
111 rasterizer->base.point_size :
112 rasterizer->base.line_width;
113
114 primitive_size->constant = val;
115 }
116 }
117
118 static void
119 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
120 struct mali_vertex_tiler_postfix *postfix)
121 {
122 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
123 if (ctx->occlusion_query) {
124 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
125 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
126 PAN_BO_ACCESS_SHARED |
127 PAN_BO_ACCESS_RW |
128 PAN_BO_ACCESS_FRAGMENT);
129 } else {
130 postfix->occlusion_counter = 0;
131 }
132 }
133
134 void
135 panfrost_vt_init(struct panfrost_context *ctx,
136 enum pipe_shader_type stage,
137 struct mali_vertex_tiler_prefix *prefix,
138 struct mali_vertex_tiler_postfix *postfix)
139 {
140 struct panfrost_device *device = pan_device(ctx->base.screen);
141
142 if (!ctx->shader[stage])
143 return;
144
145 memset(prefix, 0, sizeof(*prefix));
146 memset(postfix, 0, sizeof(*postfix));
147
148 if (device->quirks & IS_BIFROST) {
149 postfix->gl_enables = 0x2;
150 panfrost_vt_emit_shared_memory(ctx, postfix);
151 } else {
152 postfix->gl_enables = 0x6;
153 panfrost_vt_attach_framebuffer(ctx, postfix);
154 }
155
156 if (stage == PIPE_SHADER_FRAGMENT) {
157 panfrost_vt_update_occlusion_query(ctx, postfix);
158 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
159 }
160 }
161
162 static unsigned
163 panfrost_translate_index_size(unsigned size)
164 {
165 switch (size) {
166 case 1:
167 return MALI_DRAW_INDEXED_UINT8;
168
169 case 2:
170 return MALI_DRAW_INDEXED_UINT16;
171
172 case 4:
173 return MALI_DRAW_INDEXED_UINT32;
174
175 default:
176 unreachable("Invalid index size");
177 }
178 }
179
180 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
181 * good for the duration of the draw (transient), though it may last longer. Also gets
182 * the bounds on the index buffer for the range accessed by the draw. We do
183 * these operations together because there are natural optimizations which
184 * require them to be together. */
185
186 static mali_ptr
187 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
188 const struct pipe_draw_info *info,
189 unsigned *min_index, unsigned *max_index)
190 {
191 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
192 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
193 off_t offset = info->start * info->index_size;
194 bool needs_indices = true;
195 mali_ptr out = 0;
196
197 if (info->max_index != ~0u) {
198 *min_index = info->min_index;
199 *max_index = info->max_index;
200 needs_indices = false;
201 }
202
203 if (!info->has_user_indices) {
204 /* Only resources can be directly mapped */
205 panfrost_batch_add_bo(batch, rsrc->bo,
206 PAN_BO_ACCESS_SHARED |
207 PAN_BO_ACCESS_READ |
208 PAN_BO_ACCESS_VERTEX_TILER);
209 out = rsrc->bo->gpu + offset;
210
211 /* Check the cache */
212 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
213 info->start,
214 info->count,
215 min_index,
216 max_index);
217 } else {
218 /* Otherwise, we need to upload to transient memory */
219 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
220 struct panfrost_transfer T =
221 panfrost_pool_alloc_aligned(&batch->pool,
222 info->count * info->index_size,
223 info->index_size);
224
225 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
226 out = T.gpu;
227 }
228
229 if (needs_indices) {
230 /* Fallback */
231 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
232
233 if (!info->has_user_indices)
234 panfrost_minmax_cache_add(rsrc->index_cache,
235 info->start, info->count,
236 *min_index, *max_index);
237 }
238
239 return out;
240 }
241
242 void
243 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
244 const struct pipe_draw_info *info,
245 enum mali_draw_mode draw_mode,
246 struct mali_vertex_tiler_postfix *vertex_postfix,
247 struct mali_vertex_tiler_prefix *tiler_prefix,
248 struct mali_vertex_tiler_postfix *tiler_postfix,
249 unsigned *vertex_count,
250 unsigned *padded_count)
251 {
252 tiler_prefix->draw_mode = draw_mode;
253
254 unsigned draw_flags = 0;
255
256 if (panfrost_writes_point_size(ctx))
257 draw_flags |= MALI_DRAW_VARYING_SIZE;
258
259 if (info->primitive_restart)
260 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
261
262 /* These bits don't make much sense */
263
264 draw_flags |= 0x3000;
265
266 if (info->index_size) {
267 unsigned min_index = 0, max_index = 0;
268
269 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
270 info,
271 &min_index,
272 &max_index);
273
274 /* Use the corresponding values */
275 *vertex_count = max_index - min_index + 1;
276 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
277 tiler_prefix->offset_bias_correction = -min_index;
278 tiler_prefix->index_count = MALI_POSITIVE(info->count);
279 draw_flags |= panfrost_translate_index_size(info->index_size);
280 } else {
281 tiler_prefix->indices = 0;
282 *vertex_count = ctx->vertex_count;
283 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
284 tiler_prefix->offset_bias_correction = 0;
285 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
286 }
287
288 tiler_prefix->unknown_draw = draw_flags;
289
290 /* Encode the padded vertex count */
291
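/* A sketch of the encoding used below (inferred from the shift/odd
 * split): the padded count factors as (2k + 1) << shift, i.e. an odd
 * number times a power of two. For example, a padded count of 12 gives
 * shift = ctz(12) = 2 and k = 12 >> 3 = 1, since 12 = (2*1 + 1) << 2. */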
292 if (info->instance_count > 1) {
293 *padded_count = panfrost_padded_vertex_count(*vertex_count);
294
295 unsigned shift = __builtin_ctz(ctx->padded_count);
296 unsigned k = ctx->padded_count >> (shift + 1);
297
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
300 } else {
301 *padded_count = *vertex_count;
302
303 /* Reset instancing state */
304 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
305 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
306 }
307 }
308
309 static void
310 panfrost_emit_compute_shader(struct panfrost_context *ctx,
311 enum pipe_shader_type st,
312 struct mali_shader_meta *meta)
313 {
314 const struct panfrost_device *dev = pan_device(ctx->base.screen);
315 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
316
317 memset(meta, 0, sizeof(*meta));
318 memcpy(&meta->shader, &ss->shader, sizeof(ss->shader));
319 memcpy(&meta->midgard_props, &ss->properties, sizeof(ss->properties));
320
321 if (dev->quirks & IS_BIFROST)
322 memcpy(&meta->bifrost_preload, &ss->preload, sizeof(ss->preload));
323 }
324
325 static unsigned
326 translate_tex_wrap(enum pipe_tex_wrap w)
327 {
328 switch (w) {
329 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
330 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
331 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
332 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
333 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
334 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
335 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
336 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
337 default: unreachable("Invalid wrap");
338 }
339 }
340
341 /* The hardware compares in the wrong order, so we have to flip before
342 * encoding. Yes, really. */
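/* Illustrative example, assuming panfrost_flip_compare_func swaps the
 * operand order: PIPE_FUNC_LEQUAL would be encoded as MALI_FUNC_GEQUAL
 * and LESS as GREATER, while EQUAL, NOTEQUAL, ALWAYS and NEVER are
 * symmetric and unchanged. */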
343
344 static enum mali_func
345 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
346 {
347 if (!cso->compare_mode)
348 return MALI_FUNC_NEVER;
349
350 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
351 return panfrost_flip_compare_func(f);
352 }
353
354 static enum mali_mipmap_mode
355 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
356 {
357 switch (f) {
358 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
359 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
360 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
361 default: unreachable("Invalid");
362 }
363 }
364
365 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
366 struct mali_midgard_sampler_packed *hw)
367 {
368 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
369 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
370 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
371 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
372 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
373 cfg.normalized_coordinates = cso->normalized_coords;
374
375 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
376
377 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
378
379 /* If necessary, we disable mipmapping in the sampler descriptor by
380 * clamping the LOD as tight as possible (from 0 to epsilon,
381 * essentially -- remember these are fixed point numbers, so
382 * epsilon=1/256) */
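/* Sketch of the arithmetic, assuming FIXED_16 yields 8.8 fixed point:
 * minimum_lod + 1 is one ULP (= 1/256 of a level) above the minimum,
 * so the clamp range collapses to [min_lod, min_lod + 1/256] and
 * mipmapping is effectively disabled. */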
383
384 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
385 cfg.minimum_lod + 1 :
386 FIXED_16(cso->max_lod, false);
387
388 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
389 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
390 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
391
392 cfg.compare_function = panfrost_sampler_compare_func(cso);
393 cfg.seamless_cube_map = cso->seamless_cube_map;
394
395 cfg.border_color_r = cso->border_color.f[0];
396 cfg.border_color_g = cso->border_color.f[1];
397 cfg.border_color_b = cso->border_color.f[2];
398 cfg.border_color_a = cso->border_color.f[3];
399 }
400 }
401
402 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
403 struct mali_bifrost_sampler_packed *hw)
404 {
405 pan_pack(hw, BIFROST_SAMPLER, cfg) {
406 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
407 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
408 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
409 cfg.normalized_coordinates = cso->normalized_coords;
410
411 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
412 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
413 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
414
415 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
416 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
417 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
418
419 cfg.compare_function = panfrost_sampler_compare_func(cso);
420 cfg.seamless_cube_map = cso->seamless_cube_map;
421 }
422 }
423
424 static bool
425 panfrost_fs_required(
426 struct panfrost_shader_state *fs,
427 struct panfrost_blend_final *blend,
428 unsigned rt_count)
429 {
430 /* If we generally have side effects */
431 if (fs->fs_sidefx)
432 return true;
433
434 /* If colour is written we need to execute */
435 for (unsigned i = 0; i < rt_count; ++i) {
436 if (!blend[i].no_colour)
437 return true;
438 }
439
440 /* If depth is written and not implied we need to execute.
441 * TODO: Predicate on Z/S writes being enabled */
442 return (fs->writes_depth || fs->writes_stencil);
443 }
444
445 static void
446 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
447 struct panfrost_blend_final *blend)
448 {
449 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
450 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
451 unsigned rt_count = batch->key.nr_cbufs;
452
453 struct bifrost_blend_rt *brts = rts;
454 struct midgard_blend_rt *mrts = rts;
455
456 /* Disable blending for depth-only on Bifrost */
457
458 if (rt_count == 0 && dev->quirks & IS_BIFROST)
459 brts[0].unk2 = 0x3;
460
461 for (unsigned i = 0; i < rt_count; ++i) {
462 unsigned flags = 0;
463
464 pan_pack(&flags, BLEND_FLAGS, cfg) {
465 if (blend[i].no_colour) {
466 cfg.enable = false;
467 break;
468 }
469
470 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
471
472 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
473 cfg.load_destination = blend[i].load_dest;
474 cfg.dither_disable = !batch->ctx->blend->base.dither;
475
476 if (!(dev->quirks & IS_BIFROST))
477 cfg.midgard_blend_shader = blend[i].is_shader;
478 }
479
480 if (dev->quirks & IS_BIFROST) {
481 brts[i].flags = flags;
482
483 if (blend[i].is_shader) {
484 /* The blend shader's address needs to share
485 * the same top 32 bits as the fragment shader's.
486 * TODO: Ensure that's always the case.
487 */
488 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
489 (fs->bo->gpu & (0xffffffffull << 32)));
490 brts[i].shader = blend[i].shader.gpu;
491 brts[i].unk2 = 0x0;
492 } else {
493 enum pipe_format format = batch->key.cbufs[i]->format;
494 const struct util_format_description *format_desc;
495 format_desc = util_format_description(format);
496
497 brts[i].equation = blend[i].equation.equation;
498
499 /* TODO: this is a bit more complicated */
500 brts[i].constant = blend[i].equation.constant;
501
502 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
503
504 /* 0x19 disables blending and forces REPLACE
505 * mode (equivalent to rgb_mode = alpha_mode =
506 * x122, colour mask = 0xF). 0x1a allows
507 * blending. */
508 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
509
510 brts[i].shader_type = fs->blend_types[i];
511 }
512 } else {
513 memcpy(&mrts[i].flags, &flags, sizeof(flags));
514
515 if (blend[i].is_shader) {
516 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
517 } else {
518 mrts[i].blend.equation = blend[i].equation.equation;
519 mrts[i].blend.constant = blend[i].equation.constant;
520 }
521 }
522 }
523 }
524
525 static struct mali_shader_packed
526 panfrost_pack_shaderless(bool midgard)
527 {
528 struct mali_shader_packed pack;
529
530 pan_pack(&pack, SHADER, cfg) {
531 cfg.shader = midgard ? 0x1 : 0x0;
532 }
533
534 return pack;
535 }
536
537 static void
538 panfrost_emit_frag_shader(struct panfrost_context *ctx,
539 struct mali_shader_meta *fragmeta,
540 struct panfrost_blend_final *blend)
541 {
542 const struct panfrost_device *dev = pan_device(ctx->base.screen);
543 struct panfrost_shader_state *fs;
544
545 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
546
547 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
548 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
549 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
550
551 memset(fragmeta, 0, sizeof(*fragmeta));
552 memcpy(&fragmeta->shader, &fs->shader, sizeof(fs->shader));
553
554 if (dev->quirks & IS_BIFROST) {
555 struct mali_bifrost_properties_packed prop;
556
557 bool no_blend = true;
558
559 for (unsigned i = 0; i < rt_count; ++i)
560 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
561
562 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
563 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
564 }
565
566 /* Combine with prepacked properties */
567 prop.opaque[0] |= fs->properties.opaque[0];
568
569 memcpy(&fragmeta->bifrost_props, &prop, sizeof(prop));
570 memcpy(&fragmeta->bifrost_preload, &fs->preload, sizeof(fs->preload));
571 } else {
572 struct mali_midgard_properties_packed prop;
573
574 /* Reasons to disable early-Z from a shader perspective */
575 bool late_z = fs->can_discard || fs->writes_global ||
576 fs->writes_depth || fs->writes_stencil;
577
578 /* Reasons to disable early-Z from a CSO perspective */
579 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
580
581 /* If either depth or stencil is enabled, discard matters */
582 bool zs_enabled =
583 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
584 zsa->base.stencil[0].enabled;
585
586 bool has_blend_shader = false;
587
588 for (unsigned c = 0; c < rt_count; ++c)
589 has_blend_shader |= blend[c].is_shader;
590
591 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
592 /* TODO: Reduce this limit? */
593 if (has_blend_shader)
594 cfg.work_register_count = MAX2(fs->work_reg_count, 8);
595 else
596 cfg.work_register_count = fs->work_reg_count;
597
598 cfg.early_z_enable = !(late_z || alpha_to_coverage);
599 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
600 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
601 }
602
603 /* Combine with prepacked properties */
604 prop.opaque[0] |= fs->properties.opaque[0];
605 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
606 }
607
608 bool msaa = rast->multisample;
609 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
610
611 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
612 fragmeta->unknown2_4 = 0x4e0;
613
614 /* TODO: Sample size */
615 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
616 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
617
618 /* EXT_shader_framebuffer_fetch requires the shader to be run
619 * per-sample when outputs are read. */
620 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
621 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
622
623 fragmeta->depth_units = rast->offset_units * 2.0f;
624 fragmeta->depth_factor = rast->offset_scale;
625
626 /* XXX: Which bit is which? Does this maybe allow offsetting non-tris? */
627
628 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
629 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
630
631 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
632 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
633
634 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
635 zsa->base.stencil[0].enabled);
636
637 fragmeta->stencil_mask_front = zsa->stencil_mask_front;
638 fragmeta->stencil_mask_back = zsa->stencil_mask_back;
639
640 /* Bottom bits for stencil ref, exactly one word */
641 fragmeta->stencil_front.opaque[0] = zsa->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
642
643 /* If back-stencil is not enabled, use the front values */
644
645 if (zsa->base.stencil[1].enabled)
646 fragmeta->stencil_back.opaque[0] = zsa->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
647 else
648 fragmeta->stencil_back = fragmeta->stencil_front;
649
650 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
651 zsa->base.depth.writemask);
652
653 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
654 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(
655 zsa->base.depth.enabled ? zsa->base.depth.func : PIPE_FUNC_ALWAYS));
656
657 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
658 ctx->blend->base.alpha_to_coverage);
659
660 /* Disable shader execution if we can */
661 if (!panfrost_fs_required(fs, blend, rt_count)) {
662 struct mali_shader_packed shader =
663 panfrost_pack_shaderless(!(dev->quirks & IS_BIFROST));
664
665 memcpy(&fragmeta->shader, &shader, sizeof(shader));
666
667 struct mali_midgard_properties_packed prop;
668
669 if (dev->quirks & IS_BIFROST) {
670 pan_pack(&prop, BIFROST_PROPERTIES, cfg) {
671 cfg.unknown = 0x950020; /* XXX */
672 cfg.early_z_enable = true;
673 }
674 } else {
675 pan_pack(&prop, MIDGARD_PROPERTIES, cfg) {
676 cfg.work_register_count = 1;
677 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
678 cfg.early_z_enable = true;
679 }
680 }
681
682 memcpy(&fragmeta->midgard_props, &prop, sizeof(prop));
683 }
684
685 if (dev->quirks & MIDGARD_SFBD) {
686 /* On single render target (SFBD) hardware, the blend
687 * information lives inside the shader meta itself. We additionally
688 * need to signal CAN_DISCARD for nontrivial blend modes (so
689 * we're able to read back the destination buffer) */
690
691 if (blend[0].no_colour)
692 return;
693
694 fragmeta->unknown2_4 |= MALI_SFBD_ENABLE;
695
696 SET_BIT(fragmeta->unknown2_4, MALI_SFBD_SRGB,
697 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format));
698
699 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
700 blend[0].is_shader);
701
702 if (blend[0].is_shader) {
703 fragmeta->blend.shader = blend[0].shader.gpu |
704 blend[0].shader.first_tag;
705 } else {
706 fragmeta->blend.equation = blend[0].equation.equation;
707 fragmeta->blend.constant = blend[0].equation.constant;
708 }
709
710 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
711 blend[0].load_dest);
712
713 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER, !ctx->blend->base.dither);
714 } else if (!(dev->quirks & IS_BIFROST)) {
715 /* Bug where MRT-capable hw apparently reads the last blend
716 * shader from here instead of the usual location? */
717
718 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
719 if (!blend[rt].is_shader)
720 continue;
721
722 fragmeta->blend.shader = blend[rt].shader.gpu |
723 blend[rt].shader.first_tag;
724 break;
725 }
726 }
727 }
728
729 void
730 panfrost_emit_shader_meta(struct panfrost_batch *batch,
731 enum pipe_shader_type st,
732 struct mali_vertex_tiler_postfix *postfix)
733 {
734 struct panfrost_context *ctx = batch->ctx;
735 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
736
737 if (!ss) {
738 postfix->shader = 0;
739 return;
740 }
741
742 struct mali_shader_meta meta;
743
744 /* Add the shader BO to the batch. */
745 panfrost_batch_add_bo(batch, ss->bo,
746 PAN_BO_ACCESS_PRIVATE |
747 PAN_BO_ACCESS_READ |
748 panfrost_bo_access_for_stage(st));
749
750 mali_ptr shader_ptr;
751
752 if (st == PIPE_SHADER_FRAGMENT) {
753 struct panfrost_device *dev = pan_device(ctx->base.screen);
754 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
755 size_t desc_size = sizeof(meta);
756 void *rts = NULL;
757 struct panfrost_transfer xfer;
758 unsigned rt_size;
759
760 if (dev->quirks & MIDGARD_SFBD)
761 rt_size = 0;
762 else if (dev->quirks & IS_BIFROST)
763 rt_size = sizeof(struct bifrost_blend_rt);
764 else
765 rt_size = sizeof(struct midgard_blend_rt);
766
767 desc_size += rt_size * rt_count;
768
769 if (rt_size)
770 rts = rzalloc_size(ctx, rt_size * rt_count);
771
772 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
773
774 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
775 blend[c] = panfrost_get_blend_for_context(ctx, c);
776
777 panfrost_emit_frag_shader(ctx, &meta, blend);
778
779 if (!(dev->quirks & MIDGARD_SFBD))
780 panfrost_emit_blend(batch, rts, blend);
781 else
782 batch->draws |= PIPE_CLEAR_COLOR0;
783
784 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, sizeof(meta));
785
786 memcpy(xfer.cpu, &meta, sizeof(meta));
787 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
788
789 if (rt_size)
790 ralloc_free(rts);
791
792 shader_ptr = xfer.gpu;
793 } else {
794 panfrost_emit_compute_shader(ctx, st, &meta);
795
796 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
797 sizeof(meta));
798 }
799
800 postfix->shader = shader_ptr;
801 }
802
803 void
804 panfrost_emit_viewport(struct panfrost_batch *batch,
805 struct mali_vertex_tiler_postfix *tiler_postfix)
806 {
807 struct panfrost_context *ctx = batch->ctx;
808 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
809 const struct pipe_scissor_state *ss = &ctx->scissor;
810 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
811 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
812
813 /* Derive min/max from translate/scale. Note since |x| >= 0 by
814 * definition, we have that -|x| <= |x| hence translate - |scale| <=
815 * translate + |scale|, so the ordering is correct here. */
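/* For a typical GL-style viewport of width W anchored at x0, the state
 * tracker sets scale[0] = W/2 and translate[0] = x0 + W/2, so the
 * formulas below recover vp_minx = x0 and vp_maxx = x0 + W (and
 * likewise for y). */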
816 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
817 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
818 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
819 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
820 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
821 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
822
823 /* Scissor to the intersection of viewport and to the scissor, clamped
824 * to the framebuffer */
825
826 unsigned minx = MIN2(fb->width, vp_minx);
827 unsigned maxx = MIN2(fb->width, vp_maxx);
828 unsigned miny = MIN2(fb->height, vp_miny);
829 unsigned maxy = MIN2(fb->height, vp_maxy);
830
831 if (ss && rast->scissor) {
832 minx = MAX2(ss->minx, minx);
833 miny = MAX2(ss->miny, miny);
834 maxx = MIN2(ss->maxx, maxx);
835 maxy = MIN2(ss->maxy, maxy);
836 }
837
838 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
839
840 pan_pack(T.cpu, VIEWPORT, cfg) {
841 cfg.scissor_minimum_x = minx;
842 cfg.scissor_minimum_y = miny;
843 cfg.scissor_maximum_x = maxx - 1;
844 cfg.scissor_maximum_y = maxy - 1;
845
846 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
847 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
848 }
849
850 tiler_postfix->viewport = T.gpu;
851 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
852 }
853
854 static mali_ptr
855 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
856 enum pipe_shader_type st,
857 struct panfrost_constant_buffer *buf,
858 unsigned index)
859 {
860 struct pipe_constant_buffer *cb = &buf->cb[index];
861 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
862
863 if (rsrc) {
864 panfrost_batch_add_bo(batch, rsrc->bo,
865 PAN_BO_ACCESS_SHARED |
866 PAN_BO_ACCESS_READ |
867 panfrost_bo_access_for_stage(st));
868
869 /* Alignment guaranteed by
870 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
871 return rsrc->bo->gpu + cb->buffer_offset;
872 } else if (cb->user_buffer) {
873 return panfrost_pool_upload_aligned(&batch->pool,
874 cb->user_buffer +
875 cb->buffer_offset,
876 cb->buffer_size, 16);
877 } else {
878 unreachable("No constant buffer");
879 }
880 }
881
882 struct sysval_uniform {
883 union {
884 float f[4];
885 int32_t i[4];
886 uint32_t u[4];
887 uint64_t du[2];
888 };
889 };
890
891 static void
892 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
893 struct sysval_uniform *uniform)
894 {
895 struct panfrost_context *ctx = batch->ctx;
896 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
897
898 uniform->f[0] = vp->scale[0];
899 uniform->f[1] = vp->scale[1];
900 uniform->f[2] = vp->scale[2];
901 }
902
903 static void
904 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
905 struct sysval_uniform *uniform)
906 {
907 struct panfrost_context *ctx = batch->ctx;
908 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
909
910 uniform->f[0] = vp->translate[0];
911 uniform->f[1] = vp->translate[1];
912 uniform->f[2] = vp->translate[2];
913 }
914
915 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
916 enum pipe_shader_type st,
917 unsigned int sysvalid,
918 struct sysval_uniform *uniform)
919 {
920 struct panfrost_context *ctx = batch->ctx;
921 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
922 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
923 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
924 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
925
926 assert(dim);
927 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
928
929 if (dim > 1)
930 uniform->i[1] = u_minify(tex->texture->height0,
931 tex->u.tex.first_level);
932
933 if (dim > 2)
934 uniform->i[2] = u_minify(tex->texture->depth0,
935 tex->u.tex.first_level);
936
937 if (is_array)
938 uniform->i[dim] = tex->texture->array_size;
939 }
940
941 static void
942 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
943 enum pipe_shader_type st,
944 unsigned ssbo_id,
945 struct sysval_uniform *uniform)
946 {
947 struct panfrost_context *ctx = batch->ctx;
948
949 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
950 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
951
952 /* Compute address */
953 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
954
955 panfrost_batch_add_bo(batch, bo,
956 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
957 panfrost_bo_access_for_stage(st));
958
959 /* Upload address and size as sysval */
960 uniform->du[0] = bo->gpu + sb.buffer_offset;
961 uniform->u[2] = sb.buffer_size;
962 }
963
964 static void
965 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
966 enum pipe_shader_type st,
967 unsigned samp_idx,
968 struct sysval_uniform *uniform)
969 {
970 struct panfrost_context *ctx = batch->ctx;
971 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
972
973 uniform->f[0] = sampl->min_lod;
974 uniform->f[1] = sampl->max_lod;
975 uniform->f[2] = sampl->lod_bias;
976
977 /* Even without any errata, Midgard represents "no mipmapping" as
978 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
979 * panfrost_create_sampler_state which also explains our choice of
980 * epsilon value (again to keep behaviour consistent) */
981
982 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
983 uniform->f[1] = uniform->f[0] + (1.0/256.0);
984 }
985
986 static void
987 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
988 struct sysval_uniform *uniform)
989 {
990 struct panfrost_context *ctx = batch->ctx;
991
992 uniform->u[0] = ctx->compute_grid->grid[0];
993 uniform->u[1] = ctx->compute_grid->grid[1];
994 uniform->u[2] = ctx->compute_grid->grid[2];
995 }
996
997 static void
998 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
999 struct panfrost_shader_state *ss,
1000 enum pipe_shader_type st)
1001 {
1002 struct sysval_uniform *uniforms = (void *)buf;
1003
1004 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1005 int sysval = ss->sysval[i];
1006
1007 switch (PAN_SYSVAL_TYPE(sysval)) {
1008 case PAN_SYSVAL_VIEWPORT_SCALE:
1009 panfrost_upload_viewport_scale_sysval(batch,
1010 &uniforms[i]);
1011 break;
1012 case PAN_SYSVAL_VIEWPORT_OFFSET:
1013 panfrost_upload_viewport_offset_sysval(batch,
1014 &uniforms[i]);
1015 break;
1016 case PAN_SYSVAL_TEXTURE_SIZE:
1017 panfrost_upload_txs_sysval(batch, st,
1018 PAN_SYSVAL_ID(sysval),
1019 &uniforms[i]);
1020 break;
1021 case PAN_SYSVAL_SSBO:
1022 panfrost_upload_ssbo_sysval(batch, st,
1023 PAN_SYSVAL_ID(sysval),
1024 &uniforms[i]);
1025 break;
1026 case PAN_SYSVAL_NUM_WORK_GROUPS:
1027 panfrost_upload_num_work_groups_sysval(batch,
1028 &uniforms[i]);
1029 break;
1030 case PAN_SYSVAL_SAMPLER:
1031 panfrost_upload_sampler_sysval(batch, st,
1032 PAN_SYSVAL_ID(sysval),
1033 &uniforms[i]);
1034 break;
1035 default:
1036 assert(0);
1037 }
1038 }
1039 }
1040
1041 static const void *
1042 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1043 unsigned index)
1044 {
1045 struct pipe_constant_buffer *cb = &buf->cb[index];
1046 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1047
1048 if (rsrc)
1049 return rsrc->bo->cpu;
1050 else if (cb->user_buffer)
1051 return cb->user_buffer;
1052 else
1053 unreachable("No constant buffer");
1054 }
1055
1056 void
1057 panfrost_emit_const_buf(struct panfrost_batch *batch,
1058 enum pipe_shader_type stage,
1059 struct mali_vertex_tiler_postfix *postfix)
1060 {
1061 struct panfrost_context *ctx = batch->ctx;
1062 struct panfrost_shader_variants *all = ctx->shader[stage];
1063
1064 if (!all)
1065 return;
1066
1067 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1068
1069 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1070
1071 /* Uniforms are implicitly UBO #0 */
1072 bool has_uniforms = buf->enabled_mask & (1 << 0);
1073
1074 /* Allocate room for the sysvals and the uniforms */
1075 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1076 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1077 size_t size = sys_size + uniform_size;
1078 struct panfrost_transfer transfer =
1079 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
1080
1081 /* Upload sysvals requested by the shader */
1082 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1083
1084 /* Upload uniforms */
1085 if (has_uniforms && uniform_size) {
1086 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1087 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1088 }
1089
1090 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1091 * uploaded, so it's always included. The count is the highest UBO
1092 * addressable -- gaps are included. */
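/* Worked example: enabled_mask = 0b1001 (UBOs 0 and 3 bound) gives
 * 32 - clz(0b1001) = 4, so descriptors 0..3 are emitted; the disabled
 * UBOs 1 and 2 become null entries in the loop further down. */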
1093
1094 unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
1095
1096 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1097 struct panfrost_transfer ubos =
1098 panfrost_pool_alloc_aligned(&batch->pool, sz,
1099 MALI_UNIFORM_BUFFER_LENGTH);
1100
1101 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1102
1103 /* Upload uniforms as a UBO */
1104
1105 if (size) {
1106 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1107 cfg.entries = DIV_ROUND_UP(size, 16);
1108 cfg.pointer = transfer.gpu;
1109 }
1110 } else {
1111 *ubo_ptr = 0;
1112 }
1113
1114 /* The rest are honest-to-goodness UBOs */
1115
1116 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1117 size_t usz = buf->cb[ubo].buffer_size;
1118 bool enabled = buf->enabled_mask & (1 << ubo);
1119 bool empty = usz == 0;
1120
1121 if (!enabled || empty) {
1122 ubo_ptr[ubo] = 0;
1123 continue;
1124 }
1125
1126 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1127 cfg.entries = DIV_ROUND_UP(usz, 16);
1128 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1129 stage, buf, ubo);
1130 }
1131 }
1132
1133 postfix->uniforms = transfer.gpu;
1134 postfix->uniform_buffers = ubos.gpu;
1135
1136 buf->dirty_mask = 0;
1137 }
1138
1139 void
1140 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1141 const struct pipe_grid_info *info,
1142 struct midgard_payload_vertex_tiler *vtp)
1143 {
1144 struct panfrost_context *ctx = batch->ctx;
1145 struct panfrost_device *dev = pan_device(ctx->base.screen);
1146 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1147 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1148 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1149 128));
1150
1151 unsigned log2_instances =
1152 util_logbase2_ceil(info->grid[0]) +
1153 util_logbase2_ceil(info->grid[1]) +
1154 util_logbase2_ceil(info->grid[2]);
1155
1156 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1157 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1158 shared_size,
1159 1);
1160
1161 struct mali_shared_memory shared = {
1162 .shared_memory = bo->gpu,
1163 .shared_workgroup_count = log2_instances,
1164 .shared_shift = util_logbase2(single_size) + 1
1165 };
1166
1167 vtp->postfix.shared_memory = panfrost_pool_upload_aligned(&batch->pool, &shared,
1168 sizeof(shared), 64);
1169 }
1170
1171 static mali_ptr
1172 panfrost_get_tex_desc(struct panfrost_batch *batch,
1173 enum pipe_shader_type st,
1174 struct panfrost_sampler_view *view)
1175 {
1176 if (!view)
1177 return (mali_ptr) 0;
1178
1179 struct pipe_sampler_view *pview = &view->base;
1180 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1181
1182 /* Add the BO to the job so it's retained until the job is done. */
1183
1184 panfrost_batch_add_bo(batch, rsrc->bo,
1185 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1186 panfrost_bo_access_for_stage(st));
1187
1188 panfrost_batch_add_bo(batch, view->bo,
1189 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1190 panfrost_bo_access_for_stage(st));
1191
1192 return view->bo->gpu;
1193 }
1194
1195 static void
1196 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1197 struct pipe_context *pctx)
1198 {
1199 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1200 if (view->texture_bo != rsrc->bo->gpu ||
1201 view->modifier != rsrc->modifier) {
1202 panfrost_bo_unreference(view->bo);
1203 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1204 }
1205 }
1206
1207 void
1208 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1209 enum pipe_shader_type stage,
1210 struct mali_vertex_tiler_postfix *postfix)
1211 {
1212 struct panfrost_context *ctx = batch->ctx;
1213 struct panfrost_device *device = pan_device(ctx->base.screen);
1214
1215 if (!ctx->sampler_view_count[stage])
1216 return;
1217
1218 if (device->quirks & IS_BIFROST) {
1219 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1220 MALI_BIFROST_TEXTURE_LENGTH *
1221 ctx->sampler_view_count[stage],
1222 MALI_BIFROST_TEXTURE_LENGTH);
1223
1224 struct mali_bifrost_texture_packed *out =
1225 (struct mali_bifrost_texture_packed *) T.cpu;
1226
1227 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1228 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1229 struct pipe_sampler_view *pview = &view->base;
1230 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1231
1232 panfrost_update_sampler_view(view, &ctx->base);
1233 out[i] = view->bifrost_descriptor;
1234
1235 /* Add the BOs to the job so they are retained until the job is done. */
1236
1237 panfrost_batch_add_bo(batch, rsrc->bo,
1238 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1239 panfrost_bo_access_for_stage(stage));
1240
1241 panfrost_batch_add_bo(batch, view->bo,
1242 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1243 panfrost_bo_access_for_stage(stage));
1244 }
1245
1246 postfix->textures = T.gpu;
1247 } else {
1248 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1249
1250 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1251 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1252
1253 panfrost_update_sampler_view(view, &ctx->base);
1254
1255 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1256 }
1257
1258 postfix->textures = panfrost_pool_upload_aligned(&batch->pool,
1259 trampolines,
1260 sizeof(uint64_t) *
1261 ctx->sampler_view_count[stage],
1262 sizeof(uint64_t));
1263 }
1264 }
1265
1266 void
1267 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1268 enum pipe_shader_type stage,
1269 struct mali_vertex_tiler_postfix *postfix)
1270 {
1271 struct panfrost_context *ctx = batch->ctx;
1272
1273 if (!ctx->sampler_count[stage])
1274 return;
1275
1276 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1277 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1278
1279 size_t sz = desc_size * ctx->sampler_count[stage];
1280 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1281 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1282
1283 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1284 out[i] = ctx->samplers[stage][i]->hw;
1285
1286 postfix->sampler_descriptor = T.gpu;
1287 }
1288
1289 void
1290 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1291 struct mali_vertex_tiler_postfix *vertex_postfix)
1292 {
1293 struct panfrost_context *ctx = batch->ctx;
1294 struct panfrost_vertex_state *so = ctx->vertex;
1295 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1296
1297 unsigned instance_shift = vertex_postfix->instance_shift;
1298 unsigned instance_odd = vertex_postfix->instance_odd;
1299
1300 /* Worst case: everything is NPOT, which is only possible if instancing
1301 * is enabled. Otherwise a single record is guaranteed */
1302 bool could_npot = instance_shift || instance_odd;
1303
1304 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1305 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1306 (could_npot ? 2 : 1),
1307 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1308
1309 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1310 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1311 MALI_ATTRIBUTE_LENGTH);
1312
1313 struct mali_attribute_buffer_packed *bufs =
1314 (struct mali_attribute_buffer_packed *) S.cpu;
1315
1316 struct mali_attribute_packed *out =
1317 (struct mali_attribute_packed *) T.cpu;
1318
1319 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1320 unsigned k = 0;
1321
1322 for (unsigned i = 0; i < so->num_elements; ++i) {
1323 /* We map buffers 1:1 with the attributes, which
1324 * means duplicating some vertex buffers (who cares? aside from
1325 * maybe some caching implications but I somehow doubt that
1326 * matters) */
1327
1328 struct pipe_vertex_element *elem = &so->pipe[i];
1329 unsigned vbi = elem->vertex_buffer_index;
1330 attrib_to_buffer[i] = k;
1331
1332 if (!(ctx->vb_mask & (1 << vbi)))
1333 continue;
1334
1335 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1336 struct panfrost_resource *rsrc;
1337
1338 rsrc = pan_resource(buf->buffer.resource);
1339 if (!rsrc)
1340 continue;
1341
1342 /* Add a dependency of the batch on the vertex buffer */
1343 panfrost_batch_add_bo(batch, rsrc->bo,
1344 PAN_BO_ACCESS_SHARED |
1345 PAN_BO_ACCESS_READ |
1346 PAN_BO_ACCESS_VERTEX_TILER);
1347
1348 /* Mask off lower bits, see offset fixup below */
1349 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1350 mali_ptr addr = raw_addr & ~63;
1351
1352 /* Since we advanced the base pointer, we shrink the buffer
1353 * size, but add the offset we subtracted */
1354 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1355 - buf->buffer_offset;
1356
1357 /* When there is a divisor, the hardware-level divisor is
1358 * the product of the instance divisor and the padded count */
1359 unsigned divisor = elem->instance_divisor;
1360 unsigned hw_divisor = ctx->padded_count * divisor;
1361 unsigned stride = buf->stride;
1362
1363 /* If there's a divisor (even 1) but no instancing, every vertex
1364 * should see the same attribute value, so zero the stride */
1365
1366 if (divisor && ctx->instance_count == 1)
1367 stride = 0;
1368
1369 if (!divisor || ctx->instance_count <= 1) {
1370 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1371 if (ctx->instance_count > 1)
1372 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1373
1374 cfg.pointer = addr;
1375 cfg.stride = stride;
1376 cfg.size = size;
1377 cfg.divisor_r = instance_shift;
1378 cfg.divisor_p = instance_odd;
1379 }
1380 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1381 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1382 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1383 cfg.pointer = addr;
1384 cfg.stride = stride;
1385 cfg.size = size;
1386 cfg.divisor_r = __builtin_ctz(hw_divisor);
1387 }
1388
1389 } else {
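/* Non-power-of-two divisors: the hardware presumably divides by
 * multiplying with a precomputed fixed-point reciprocal (the "magic"
 * numerator) and shifting, libdivide-style; panfrost_compute_magic_divisor
 * is assumed to derive that numerator along with the shift and extra
 * flag bits consumed below. */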
1390 unsigned shift = 0, extra_flags = 0;
1391
1392 unsigned magic_divisor =
1393 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1394
1395 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1396 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1397 cfg.pointer = addr;
1398 cfg.stride = stride;
1399 cfg.size = size;
1400
1401 cfg.divisor_r = shift;
1402 cfg.divisor_e = extra_flags;
1403 }
1404
1405 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1406 cfg.divisor_numerator = magic_divisor;
1407 cfg.divisor = divisor;
1408 }
1409
1410 ++k;
1411 }
1412
1413 ++k;
1414 }
1415
1416 /* Add special gl_VertexID/gl_InstanceID buffers */
1417
1418 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1419 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1420
1421 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1422 cfg.buffer_index = k++;
1423 cfg.format = so->formats[PAN_VERTEX_ID];
1424 }
1425
1426 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1427
1428 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1429 cfg.buffer_index = k++;
1430 cfg.format = so->formats[PAN_INSTANCE_ID];
1431 }
1432 }
1433
1434 /* Attribute addresses require 64-byte alignment, so let:
1435 *
1436 * base' = base & ~63 = base - (base & 63)
1437 * offset' = offset + (base & 63)
1438 *
1439 * Since base' + offset' = base + offset, these are equivalent
1440 * addressing modes and now base is 64 aligned.
1441 */
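/* Numeric example: base = 0x10070, offset = 4. Then base & 63 = 0x30,
 * so base' = 0x10040 and offset' = 4 + 0x30 = 0x34; base' + offset' =
 * 0x10074 = base + offset, and base' is 64-byte aligned. */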
1442
1443 unsigned start = vertex_postfix->offset_start;
1444
1445 for (unsigned i = 0; i < so->num_elements; ++i) {
1446 unsigned vbi = so->pipe[i].vertex_buffer_index;
1447 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1448
1449 /* Adjust by the masked off bits of the offset. Make sure we
1450 * read src_offset from so->hw (which is not GPU visible)
1451 * rather than target (which is) due to caching effects */
1452
1453 unsigned src_offset = so->pipe[i].src_offset;
1454
1455 /* BOs aligned to 4k so guaranteed aligned to 64 */
1456 src_offset += (buf->buffer_offset & 63);
1457
1458 /* Also, somewhat obscurely, per-instance data needs to be
1459 * offset in response to a delayed start in an indexed draw */
1460
1461 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1462 src_offset -= buf->stride * start;
1463
1464 pan_pack(out + i, ATTRIBUTE, cfg) {
1465 cfg.buffer_index = attrib_to_buffer[i];
1466 cfg.format = so->formats[i];
1467 cfg.offset = src_offset;
1468 }
1469 }
1470
1471 vertex_postfix->attributes = S.gpu;
1472 vertex_postfix->attribute_meta = T.gpu;
1473 }
1474
1475 static mali_ptr
1476 panfrost_emit_varyings(struct panfrost_batch *batch,
1477 struct mali_attribute_buffer_packed *slot,
1478 unsigned stride, unsigned count)
1479 {
1480 unsigned size = stride * count;
1481 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1482
1483 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1484 cfg.stride = stride;
1485 cfg.size = size;
1486 cfg.pointer = ptr;
1487 }
1488
1489 return ptr;
1490 }
1491
1492 static unsigned
1493 panfrost_streamout_offset(unsigned stride, unsigned offset,
1494 struct pipe_stream_output_target *target)
1495 {
1496 return (target->buffer_offset + (offset * stride * 4)) & 63;
1497 }
1498
1499 static void
1500 panfrost_emit_streamout(struct panfrost_batch *batch,
1501 struct mali_attribute_buffer_packed *slot,
1502 unsigned stride_words, unsigned offset, unsigned count,
1503 struct pipe_stream_output_target *target)
1504 {
1505 unsigned stride = stride_words * 4;
1506 unsigned max_size = target->buffer_size;
1507 unsigned expected_size = stride * count;
1508
1509 /* Grab the BO and bind it to the batch */
1510 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1511
1512 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1513 * the perspective of the TILER and FRAGMENT.
1514 */
1515 panfrost_batch_add_bo(batch, bo,
1516 PAN_BO_ACCESS_SHARED |
1517 PAN_BO_ACCESS_RW |
1518 PAN_BO_ACCESS_VERTEX_TILER |
1519 PAN_BO_ACCESS_FRAGMENT);
1520
1521 /* We will have an offset applied to get alignment */
1522 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1523
1524 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1525 cfg.pointer = (addr & ~63);
1526 cfg.stride = stride;
1527 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1528 }
1529 }
1530
1531 static bool
1532 has_point_coord(unsigned mask, gl_varying_slot loc)
1533 {
1534 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1535 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1536 else if (loc == VARYING_SLOT_PNTC)
1537 return (mask & (1 << 8));
1538 else
1539 return false;
1540 }
1541
1542 /* Helpers for manipulating stream out information so we can pack varyings
1543 * accordingly. Compute the src_offset for a given captured varying */
1544
1545 static struct pipe_stream_output *
1546 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1547 {
1548 for (unsigned i = 0; i < info->num_outputs; ++i) {
1549 if (info->output[i].register_index == loc)
1550 return &info->output[i];
1551 }
1552
1553 unreachable("Varying not captured");
1554 }
1555
1556 static unsigned
1557 pan_varying_size(enum mali_format fmt)
1558 {
1559 unsigned type = MALI_EXTRACT_TYPE(fmt);
1560 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1561 unsigned bits = MALI_EXTRACT_BITS(fmt);
1562 unsigned bpc = 0;
1563
1564 if (bits == MALI_CHANNEL_FLOAT) {
1565 /* No doubles */
1566 bool fp16 = (type == MALI_FORMAT_SINT);
1567 assert(fp16 || (type == MALI_FORMAT_UNORM));
1568
1569 bpc = fp16 ? 2 : 4;
1570 } else {
1571 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1572
1573 /* See the enums */
1574 bits = 1 << bits;
1575 assert(bits >= 8);
1576 bpc = bits / 8;
1577 }
1578
1579 return bpc * chan;
1580 }
1581
1582 /* Indices for named (non-XFB) varyings that are present. These are packed
1583 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1584 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1585 * of a given special field given a shift S by:
1586 *
1587 * idx = popcount(P & ((1 << S) - 1))
1588 *
1589 * That is, look at all of the varyings that come earlier and count them;
1590 * that count is this varying's index. Likewise, the total number of special
1591 * buffers required is simply popcount(P)
1592 */
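/* Worked example: with GENERAL, POSITION and PSIZ present, P = 0b0111.
 * The buffer index of PSIZ (S = PAN_VARY_PSIZ = 2) is
 * popcount(0b0111 & 0b0011) = 2, i.e. the two buffers that come before
 * it, and popcount(P) = 3 special buffers are needed in total. */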
1593
1594 enum pan_special_varying {
1595 PAN_VARY_GENERAL = 0,
1596 PAN_VARY_POSITION = 1,
1597 PAN_VARY_PSIZ = 2,
1598 PAN_VARY_PNTCOORD = 3,
1599 PAN_VARY_FACE = 4,
1600 PAN_VARY_FRAGCOORD = 5,
1601
1602 /* Keep last */
1603 PAN_VARY_MAX,
1604 };
1605
1606 /* Given a varying, figure out which index it corresponds to */
1607
1608 static inline unsigned
1609 pan_varying_index(unsigned present, enum pan_special_varying v)
1610 {
1611 unsigned mask = (1 << v) - 1;
1612 return util_bitcount(present & mask);
1613 }
1614
1615 /* Get the base offset for XFB buffers, which by convention come after
1616 * everything else. Wrapper function for semantic reasons; by construction this
1617 * is just popcount. */
1618
1619 static inline unsigned
1620 pan_xfb_base(unsigned present)
1621 {
1622 return util_bitcount(present);
1623 }
1624
1625 /* Computes the present mask for varyings so we can start emitting varying records */
1626
1627 static inline unsigned
1628 pan_varying_present(
1629 struct panfrost_shader_state *vs,
1630 struct panfrost_shader_state *fs,
1631 unsigned quirks)
1632 {
1633 /* At the moment we always emit general and position buffers. Not
1634 * strictly necessary but usually harmless */
1635
1636 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1637
1638 /* Enable special buffers by the shader info */
1639
1640 if (vs->writes_point_size)
1641 present |= (1 << PAN_VARY_PSIZ);
1642
1643 if (fs->reads_point_coord)
1644 present |= (1 << PAN_VARY_PNTCOORD);
1645
1646 if (fs->reads_face)
1647 present |= (1 << PAN_VARY_FACE);
1648
1649 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1650 present |= (1 << PAN_VARY_FRAGCOORD);
1651
1652 /* Also, if we have a point sprite, we need a point coord buffer */
1653
1654 for (unsigned i = 0; i < fs->varying_count; i++) {
1655 gl_varying_slot loc = fs->varyings_loc[i];
1656
1657 if (has_point_coord(fs->point_sprite_mask, loc))
1658 present |= (1 << PAN_VARY_PNTCOORD);
1659 }
1660
1661 return present;
1662 }
1663
1664 /* Emitters for varying records */
1665
1666 static void
1667 pan_emit_vary(struct mali_attribute_packed *out,
1668 unsigned present, enum pan_special_varying buf,
1669 unsigned quirks, enum mali_format format,
1670 unsigned offset)
1671 {
1672 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1673 unsigned swizzle = quirks & HAS_SWIZZLES ?
1674 panfrost_get_default_swizzle(nr_channels) :
1675 panfrost_bifrost_swizzle(nr_channels);
1676
1677 pan_pack(out, ATTRIBUTE, cfg) {
1678 cfg.buffer_index = pan_varying_index(present, buf);
1679 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1680 cfg.format = (format << 12) | swizzle;
1681 cfg.offset = offset;
1682 }
1683 }
1684
1685 /* General varying that is unused */
1686
1687 static void
1688 pan_emit_vary_only(struct mali_attribute_packed *out,
1689 unsigned present, unsigned quirks)
1690 {
1691 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1692 }
1693
1694 /* Special records */
1695
1696 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1697 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1698 [PAN_VARY_PSIZ] = MALI_R16F,
1699 [PAN_VARY_PNTCOORD] = MALI_R16F,
1700 [PAN_VARY_FACE] = MALI_R32I,
1701 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1702 };
1703
1704 static void
1705 pan_emit_vary_special(struct mali_attribute_packed *out,
1706 unsigned present, enum pan_special_varying buf,
1707 unsigned quirks)
1708 {
1709 assert(buf < PAN_VARY_MAX);
1710 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1711 }
1712
1713 static enum mali_format
1714 pan_xfb_format(enum mali_format format, unsigned nr)
1715 {
1716 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1717 return MALI_R32F | MALI_NR_CHANNELS(nr);
1718 else
1719 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1720 }
1721
1722 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1723 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1724 * value. */
1725
1726 static void
1727 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1728 unsigned present,
1729 unsigned max_xfb,
1730 unsigned *streamout_offsets,
1731 unsigned quirks,
1732 enum mali_format format,
1733 struct pipe_stream_output o)
1734 {
1735 unsigned swizzle = quirks & HAS_SWIZZLES ?
1736 panfrost_get_default_swizzle(o.num_components) :
1737 panfrost_bifrost_swizzle(o.num_components);
1738
1739 pan_pack(out, ATTRIBUTE, cfg) {
1740 /* XFB buffers come after everything else */
1741 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1742 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1743
1744 /* Override number of channels and precision to highp */
1745 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1746
1747 /* Combine the per-output offset (in dwords) with the streamout buffer offset */
1748 cfg.offset = (o.dst_offset * 4) /* dwords */
1749 + streamout_offsets[o.output_buffer];
1750 }
1751 }
1752
1753 /* Determine if we should capture a varying for XFB. This requires actually
1754 * having a buffer for it. If we don't capture it, we'll fall back to a general
1755 * varying path (linked or unlinked, possibly discarding the write) */
1756
1757 static bool
1758 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1759 unsigned loc, unsigned max_xfb)
1760 {
1761 if (!(xfb->so_mask & (1ll << loc)))
1762 return false;
1763
1764 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1765 return o->output_buffer < max_xfb;
1766 }
1767
1768 static void
1769 pan_emit_general_varying(struct mali_attribute_packed *out,
1770 struct panfrost_shader_state *other,
1771 struct panfrost_shader_state *xfb,
1772 gl_varying_slot loc,
1773 enum mali_format format,
1774 unsigned present,
1775 unsigned quirks,
1776 unsigned *gen_offsets,
1777 enum mali_format *gen_formats,
1778 unsigned *gen_stride,
1779 unsigned idx,
1780 bool should_alloc)
1781 {
1782 /* Check if we're linked */
1783 signed other_idx = -1;
1784
1785 for (unsigned j = 0; j < other->varying_count; ++j) {
1786 if (other->varyings_loc[j] == loc) {
1787 other_idx = j;
1788 break;
1789 }
1790 }
1791
1792 if (other_idx < 0) {
1793 pan_emit_vary_only(out, present, quirks);
1794 return;
1795 }
1796
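/* On the consumer pass (should_alloc = false), this picks up the offset the
 * producer pass recorded under its own index for this location, which is our
 * other_idx here */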
1797 unsigned offset = gen_offsets[other_idx];
1798
1799 if (should_alloc) {
1800 /* We're linked, so allocate space via a watermark allocation */
1801 enum mali_format alt = other->varyings[other_idx];
1802
1803 /* Do interpolation at minimum precision */
1804 unsigned size_main = pan_varying_size(format);
1805 unsigned size_alt = pan_varying_size(alt);
1806 unsigned size = MIN2(size_main, size_alt);
1807
1808 /* If a varying is marked for XFB but not actually captured, we
1809 * should match the format to the format that would otherwise
1810 * be used for XFB, since dEQP checks for invariance here. It's
1811 * unclear if this is required by the spec. */
1812
1813 if (xfb->so_mask & (1ull << loc)) {
1814 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1815 format = pan_xfb_format(format, o->num_components);
1816 size = pan_varying_size(format);
1817 } else if (size == size_alt) {
1818 format = alt;
1819 }
1820
1821 gen_offsets[idx] = *gen_stride;
1822 gen_formats[other_idx] = format;
1823 offset = *gen_stride;
1824 *gen_stride += size;
1825 }
1826
1827 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1828 }
1829
1830 /* Higher-level wrapper around the emitters above, classifying a varying as
1831 * special, captured for XFB, or general */
1832
1833 static void
1834 panfrost_emit_varying(
1835 struct mali_attribute_packed *out,
1836 struct panfrost_shader_state *stage,
1837 struct panfrost_shader_state *other,
1838 struct panfrost_shader_state *xfb,
1839 unsigned present,
1840 unsigned max_xfb,
1841 unsigned *streamout_offsets,
1842 unsigned quirks,
1843 unsigned *gen_offsets,
1844 enum mali_format *gen_formats,
1845 unsigned *gen_stride,
1846 unsigned idx,
1847 bool should_alloc,
1848 bool is_fragment)
1849 {
1850 gl_varying_slot loc = stage->varyings_loc[idx];
1851 enum mali_format format = stage->varyings[idx];
1852
1853 /* Override format to match linkage */
1854 if (!should_alloc && gen_formats[idx])
1855 format = gen_formats[idx];
1856
1857 if (has_point_coord(stage->point_sprite_mask, loc)) {
1858 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1859 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1860 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1861 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1862 } else if (loc == VARYING_SLOT_POS) {
1863 if (is_fragment)
1864 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1865 else
1866 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1867 } else if (loc == VARYING_SLOT_PSIZ) {
1868 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1869 } else if (loc == VARYING_SLOT_PNTC) {
1870 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1871 } else if (loc == VARYING_SLOT_FACE) {
1872 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1873 } else {
1874 pan_emit_general_varying(out, other, xfb, loc, format, present,
1875 quirks, gen_offsets, gen_formats, gen_stride,
1876 idx, should_alloc);
1877 }
1878 }
1879
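/* Special inputs (point coord, front facing, gl_FragCoord on Midgard) are not
 * backed by memory; their attribute buffer records are instead marked with a
 * special code so the hardware supplies the values itself */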
1880 static void
1881 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1882 unsigned present,
1883 enum pan_special_varying v,
1884 unsigned special)
1885 {
1886 if (present & (1 << v)) {
1887 unsigned idx = pan_varying_index(present, v);
1888
1889 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1890 cfg.special = special;
1891 cfg.type = 0;
1892 }
1893 }
1894 }
1895
1896 void
1897 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1898 unsigned vertex_count,
1899 struct mali_vertex_tiler_postfix *vertex_postfix,
1900 struct mali_vertex_tiler_postfix *tiler_postfix,
1901 union midgard_primitive_size *primitive_size)
1902 {
1903 /* Load the shaders */
1904 struct panfrost_context *ctx = batch->ctx;
1905 struct panfrost_device *dev = pan_device(ctx->base.screen);
1906 struct panfrost_shader_state *vs, *fs;
1907 size_t vs_size, fs_size;
1908
1909 /* Allocate the varying descriptor */
1910
1911 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1912 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1913 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1914 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1915
1916 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1917 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1918
1919 struct pipe_stream_output_info *so = &vs->stream_output;
1920 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1921
1922 /* General-purpose, non-captured varyings are linked by us here as the
1923 * records are emitted. Captured varyings instead take their offsets from
1924 * the provided stream output information, since they were already linked
1925 * for us. */
1926
1927 unsigned gen_offsets[32];
1928 enum mali_format gen_formats[32];
1929 memset(gen_offsets, 0, sizeof(gen_offsets));
1930 memset(gen_formats, 0, sizeof(gen_formats));
1931
1932 unsigned gen_stride = 0;
1933 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1934 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1935
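/* Precompute, for each streamout target, the byte offset at which this
 * draw's captured output begins */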
1936 unsigned streamout_offsets[32];
1937
1938 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1939 streamout_offsets[i] = panfrost_streamout_offset(
1940 so->stride[i],
1941 ctx->streamout.offsets[i],
1942 ctx->streamout.targets[i]);
1943 }
1944
1945 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1946 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1947
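/* Two passes: the vertex shader's records allocate space (should_alloc = true),
 * growing gen_stride and recording offsets and formats; the fragment shader's
 * records are then linked against those results */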
1948 for (unsigned i = 0; i < vs->varying_count; i++) {
1949 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1950 ctx->streamout.num_targets, streamout_offsets,
1951 dev->quirks,
1952 gen_offsets, gen_formats, &gen_stride, i, true, false);
1953 }
1954
1955 for (unsigned i = 0; i < fs->varying_count; i++) {
1956 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1957 ctx->streamout.num_targets, streamout_offsets,
1958 dev->quirks,
1959 gen_offsets, gen_formats, &gen_stride, i, false, true);
1960 }
1961
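/* Allocate one attribute buffer record per present varying buffer, plus one
 * per bound streamout target; XFB buffers go last by convention */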
1962 unsigned xfb_base = pan_xfb_base(present);
1963 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1964 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1965 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1966 struct mali_attribute_buffer_packed *varyings =
1967 (struct mali_attribute_buffer_packed *) T.cpu;
1968
1969 /* Emit the stream out buffers */
1970
1971 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1972 ctx->vertex_count);
1973
1974 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1975 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1976 so->stride[i],
1977 ctx->streamout.offsets[i],
1978 out_count,
1979 ctx->streamout.targets[i]);
1980 }
1981
1982 panfrost_emit_varyings(batch,
1983 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1984 gen_stride, vertex_count);
1985
1986 /* fp32 vec4 gl_Position */
1987 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
1988 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
1989 sizeof(float) * 4, vertex_count);
1990
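/* fp16 gl_PointSize (MALI_R16F above), hence 2 bytes per vertex */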
1991 if (present & (1 << PAN_VARY_PSIZ)) {
1992 primitive_size->pointer = panfrost_emit_varyings(batch,
1993 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
1994 2, vertex_count);
1995 }
1996
1997 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
1998 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
1999 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2000
2001 vertex_postfix->varyings = T.gpu;
2002 tiler_postfix->varyings = T.gpu;
2003
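/* Vertex and fragment records share a single allocation; the tiler's
 * metadata simply starts after the vertex shader's records */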
2004 vertex_postfix->varying_meta = trans.gpu;
2005 tiler_postfix->varying_meta = trans.gpu + vs_size;
2006 }
2007
2008 void
2009 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2010 struct mali_vertex_tiler_prefix *vertex_prefix,
2011 struct mali_vertex_tiler_postfix *vertex_postfix,
2012 struct mali_vertex_tiler_prefix *tiler_prefix,
2013 struct mali_vertex_tiler_postfix *tiler_postfix,
2014 union midgard_primitive_size *primitive_size)
2015 {
2016 struct panfrost_context *ctx = batch->ctx;
2017 struct panfrost_device *device = pan_device(ctx->base.screen);
2018 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2019 struct bifrost_payload_vertex bifrost_vertex = {0,};
2020 struct bifrost_payload_tiler bifrost_tiler = {0,};
2021 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2022 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2023 void *vp, *tp;
2024 size_t vp_size, tp_size;
2025
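/* Bifrost and Midgard payloads differ: Bifrost keeps the primitive size and
 * tiler metadata in a dedicated tiler block, whereas Midgard carries the
 * primitive size in the vertex/tiler payload itself */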
2026 if (device->quirks & IS_BIFROST) {
2027 bifrost_vertex.prefix = *vertex_prefix;
2028 bifrost_vertex.postfix = *vertex_postfix;
2029 vp = &bifrost_vertex;
2030 vp_size = sizeof(bifrost_vertex);
2031
2032 bifrost_tiler.prefix = *tiler_prefix;
2033 bifrost_tiler.tiler.primitive_size = *primitive_size;
2034 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2035 bifrost_tiler.postfix = *tiler_postfix;
2036 tp = &bifrost_tiler;
2037 tp_size = sizeof(bifrost_tiler);
2038 } else {
2039 midgard_vertex.prefix = *vertex_prefix;
2040 midgard_vertex.postfix = *vertex_postfix;
2041 vp = &midgard_vertex;
2042 vp_size = sizeof(midgard_vertex);
2043
2044 midgard_tiler.prefix = *tiler_prefix;
2045 midgard_tiler.postfix = *tiler_postfix;
2046 midgard_tiler.primitive_size = *primitive_size;
2047 tp = &midgard_tiler;
2048 tp_size = sizeof(midgard_tiler);
2049 }
2050
2051 if (wallpapering) {
2052 /* Inject in reverse order, with "predicted" job indices.
2053 * THIS IS A HACK XXX */
2054 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2055 batch->scoreboard.job_index + 2, tp, tp_size, true);
2056 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2057 vp, vp_size, true);
2058 return;
2059 }
2060
2061 /* If rasterizer discard is enabled, only submit the vertex job */
2062
2063 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2064 vp, vp_size, false);
2065
2066 if (ctx->rasterizer->base.rasterizer_discard)
2067 return;
2068
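/* Otherwise submit the tiler job with the vertex job's index as its
 * dependency, so tiling only runs once shading has finished */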
2069 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2070 false);
2071 }
2072
2073 /* TODO: stop hardcoding this */
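/* 96 16-bit values = 48 (x, y) pairs; (128, 128) presumably encodes the pixel
 * centre in 1/256ths of a pixel, and the rest of the table is not yet
 * understood, hence the hardcoding */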
2074 mali_ptr
2075 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2076 {
2077 uint16_t locations[] = {
2078 128, 128,
2079 0, 256,
2080 0, 256,
2081 0, 256,
2082 0, 256,
2083 0, 256,
2084 0, 256,
2085 0, 256,
2086 0, 256,
2087 0, 256,
2088 0, 256,
2089 0, 256,
2090 0, 256,
2091 0, 256,
2092 0, 256,
2093 0, 256,
2094 0, 256,
2095 0, 256,
2096 0, 256,
2097 0, 256,
2098 0, 256,
2099 0, 256,
2100 0, 256,
2101 0, 256,
2102 0, 256,
2103 0, 256,
2104 0, 256,
2105 0, 256,
2106 0, 256,
2107 0, 256,
2108 0, 256,
2109 0, 256,
2110 128, 128,
2111 0, 0,
2112 0, 0,
2113 0, 0,
2114 0, 0,
2115 0, 0,
2116 0, 0,
2117 0, 0,
2118 0, 0,
2119 0, 0,
2120 0, 0,
2121 0, 0,
2122 0, 0,
2123 0, 0,
2124 0, 0,
2125 0, 0,
2126 };
2127
2128 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
2129 }