panfrost: Decontextualize rasterizer
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_pool_upload(&batch->pool, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
75 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
76 }
77
78 static void
79 panfrost_vt_update_rasterizer(struct panfrost_rasterizer *rasterizer,
80 struct mali_vertex_tiler_prefix *prefix,
81 struct mali_vertex_tiler_postfix *postfix)
82 {
83 postfix->gl_enables |= 0x7;
84 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
85 rasterizer && rasterizer->base.front_ccw);
86 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
87 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
88 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
89 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
90 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
91 rasterizer && rasterizer->base.flatshade_first);
92 }
93
94 void
95 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 union midgard_primitive_size *primitive_size)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 if (!panfrost_writes_point_size(ctx)) {
102 bool points = prefix->draw_mode == MALI_DRAW_MODE_POINTS;
103 float val = 0.0f;
104
105 if (rasterizer)
106 val = points ?
107 rasterizer->base.point_size :
108 rasterizer->base.line_width;
109
110 primitive_size->constant = val;
111 }
112 }
113
114 static void
115 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
116 struct mali_vertex_tiler_postfix *postfix)
117 {
118 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
119 if (ctx->occlusion_query) {
120 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
121 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
122 PAN_BO_ACCESS_SHARED |
123 PAN_BO_ACCESS_RW |
124 PAN_BO_ACCESS_FRAGMENT);
125 } else {
126 postfix->occlusion_counter = 0;
127 }
128 }
129
130 void
131 panfrost_vt_init(struct panfrost_context *ctx,
132 enum pipe_shader_type stage,
133 struct mali_vertex_tiler_prefix *prefix,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 struct panfrost_device *device = pan_device(ctx->base.screen);
137
138 if (!ctx->shader[stage])
139 return;
140
141 memset(prefix, 0, sizeof(*prefix));
142 memset(postfix, 0, sizeof(*postfix));
143
144 if (device->quirks & IS_BIFROST) {
145 postfix->gl_enables = 0x2;
146 panfrost_vt_emit_shared_memory(ctx, postfix);
147 } else {
148 postfix->gl_enables = 0x6;
149 panfrost_vt_attach_framebuffer(ctx, postfix);
150 }
151
152 if (stage == PIPE_SHADER_FRAGMENT) {
153 panfrost_vt_update_occlusion_query(ctx, postfix);
154 panfrost_vt_update_rasterizer(ctx->rasterizer, prefix, postfix);
155 }
156 }
157
158 static unsigned
159 panfrost_translate_index_size(unsigned size)
160 {
161 switch (size) {
162 case 1:
163 return MALI_DRAW_INDEXED_UINT8;
164
165 case 2:
166 return MALI_DRAW_INDEXED_UINT16;
167
168 case 4:
169 return MALI_DRAW_INDEXED_UINT32;
170
171 default:
172 unreachable("Invalid index size");
173 }
174 }
175
176 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
177 * good for the duration of the draw (transient), though it could last longer. Also get
178 * the bounds on the index buffer for the range accessed by the draw. We do
179 * these operations together because there are natural optimizations which
180 * require them to be together. */
181
182 static mali_ptr
183 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
184 const struct pipe_draw_info *info,
185 unsigned *min_index, unsigned *max_index)
186 {
187 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
188 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
189 off_t offset = info->start * info->index_size;
190 bool needs_indices = true;
191 mali_ptr out = 0;
192
193 if (info->max_index != ~0u) {
194 *min_index = info->min_index;
195 *max_index = info->max_index;
196 needs_indices = false;
197 }
198
199 if (!info->has_user_indices) {
200 /* Only resources can be directly mapped */
201 panfrost_batch_add_bo(batch, rsrc->bo,
202 PAN_BO_ACCESS_SHARED |
203 PAN_BO_ACCESS_READ |
204 PAN_BO_ACCESS_VERTEX_TILER);
205 out = rsrc->bo->gpu + offset;
206
207 /* Check the cache */
208 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
209 info->start,
210 info->count,
211 min_index,
212 max_index);
213 } else {
214 /* Otherwise, we need to upload to transient memory */
215 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
216 out = panfrost_pool_upload(&batch->pool, ibuf8 + offset,
217 info->count *
218 info->index_size);
219 }
220
221 if (needs_indices) {
222 /* Fallback */
223 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
224
225 if (!info->has_user_indices)
226 panfrost_minmax_cache_add(rsrc->index_cache,
227 info->start, info->count,
228 *min_index, *max_index);
229 }
230
231 return out;
232 }
233
234 void
235 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
236 const struct pipe_draw_info *info,
237 enum mali_draw_mode draw_mode,
238 struct mali_vertex_tiler_postfix *vertex_postfix,
239 struct mali_vertex_tiler_prefix *tiler_prefix,
240 struct mali_vertex_tiler_postfix *tiler_postfix,
241 unsigned *vertex_count,
242 unsigned *padded_count)
243 {
244 tiler_prefix->draw_mode = draw_mode;
245
246 unsigned draw_flags = 0;
247
248 if (panfrost_writes_point_size(ctx))
249 draw_flags |= MALI_DRAW_VARYING_SIZE;
250
251 if (info->primitive_restart)
252 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
253
254 /* These don't make much sense */
255
256 draw_flags |= 0x3000;
257
258 if (info->index_size) {
259 unsigned min_index = 0, max_index = 0;
260
261 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
262 info,
263 &min_index,
264 &max_index);
265
266 /* Use the corresponding values */
267 *vertex_count = max_index - min_index + 1;
268 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
269 tiler_prefix->offset_bias_correction = -min_index;
270 tiler_prefix->index_count = MALI_POSITIVE(info->count);
271 draw_flags |= panfrost_translate_index_size(info->index_size);
272 } else {
273 tiler_prefix->indices = 0;
274 *vertex_count = ctx->vertex_count;
275 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
276 tiler_prefix->offset_bias_correction = 0;
277 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
278 }
279
280 tiler_prefix->unknown_draw = draw_flags;
281
282 /* Encode the padded vertex count */
283
284 if (info->instance_count > 1) {
285 *padded_count = panfrost_padded_vertex_count(*vertex_count);
286
287 unsigned shift = __builtin_ctz(ctx->padded_count);
288 unsigned k = ctx->padded_count >> (shift + 1);
289
290 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
291 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
292 } else {
293 *padded_count = *vertex_count;
294
295 /* Reset instancing state */
296 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
297 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
298 }
299 }
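
/* Editorial sketch, not part of the original file: the instance_shift /
 * instance_odd encoding above relies on every positive integer factoring
 * uniquely as an odd number times a power of two. With shift = ctz(padded)
 * and k = padded >> (shift + 1), we get padded == (2k + 1) << shift; e.g.
 * padded = 12 (0b1100) gives shift = 2, k = 1, and (2*1 + 1) << 2 == 12.
 * Minimal illustrative check (hypothetical helper, not used by the driver): */

static inline bool
pan_example_check_odd_pot_split(unsigned padded)
{
        unsigned shift = __builtin_ctz(padded);
        unsigned k = padded >> (shift + 1);

        /* Reconstructing the count from the two encoded fields round-trips */
        return ((2 * k + 1) << shift) == padded;
}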
300
301 static void
302 panfrost_shader_meta_init(struct panfrost_context *ctx,
303 enum pipe_shader_type st,
304 struct mali_shader_meta *meta)
305 {
306 const struct panfrost_device *dev = pan_device(ctx->base.screen);
307 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
308
309 memset(meta, 0, sizeof(*meta));
310 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
311 meta->attribute_count = ss->attribute_count;
312 meta->varying_count = ss->varying_count;
313 meta->texture_count = ctx->sampler_view_count[st];
314 meta->sampler_count = ctx->sampler_count[st];
315
316 if (dev->quirks & IS_BIFROST) {
317 if (st == PIPE_SHADER_VERTEX)
318 meta->bifrost1.unk1 = 0x800000;
319 else {
320 /* First clause ATEST |= 0x4000000.
321 * Less than 32 regs |= 0x200 */
322 meta->bifrost1.unk1 = 0x950020;
323 }
324
325 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
326 if (st == PIPE_SHADER_VERTEX)
327 meta->bifrost2.preload_regs = 0xC0;
328 else {
329 meta->bifrost2.preload_regs = 0x1;
330 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
331 }
332
333 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
334 ss->uniform_cutoff);
335 } else {
336 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
337 ss->uniform_cutoff);
338 meta->midgard1.work_count = ss->work_reg_count;
339
340 /* TODO: This is not conformant on ES3 */
341 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
342
343 meta->midgard1.flags_lo = 0x20;
344 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
345
346 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
347 }
348 }
349
350 static unsigned
351 translate_tex_wrap(enum pipe_tex_wrap w)
352 {
353 switch (w) {
354 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
355 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
356 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
357 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
358 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
359 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
360 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
361 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
362 default: unreachable("Invalid wrap");
363 }
364 }
365
366 /* The hardware compares in the wrong order, so we have to flip before
367 * encoding. Yes, really. */
368
369 static enum mali_func
370 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
371 {
372 if (!cso->compare_mode)
373 return MALI_FUNC_NEVER;
374
375 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
376 return panfrost_flip_compare_func(f);
377 }
378
379 static enum mali_mipmap_mode
380 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
381 {
382 switch (f) {
383 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
384 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
385 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
386 default: unreachable("Invalid");
387 }
388 }
389
390 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
391 struct mali_midgard_sampler_packed *hw)
392 {
393 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
394 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
395 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
396 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
397 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
398 cfg.normalized_coordinates = cso->normalized_coords;
399
400 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
401
402 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
403
404 /* If necessary, we disable mipmapping in the sampler descriptor by
405 * clamping the LOD as tight as possible (from 0 to epsilon,
406 * essentially -- remember these are fixed point numbers, so
407 * epsilon=1/256) */
408
409 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
410 cfg.minimum_lod + 1 :
411 FIXED_16(cso->max_lod, false);
412
413 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
414 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
415 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
416
417 cfg.compare_function = panfrost_sampler_compare_func(cso);
418 cfg.seamless_cube_map = cso->seamless_cube_map;
419
420 cfg.border_color_r = cso->border_color.f[0];
421 cfg.border_color_g = cso->border_color.f[1];
422 cfg.border_color_b = cso->border_color.f[2];
423 cfg.border_color_a = cso->border_color.f[3];
424 }
425 }
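
/* Editorial sketch, not part of the original file: assuming FIXED_16 stores
 * LOD in 1/256 steps (the epsilon mentioned above), disabling mipmapping is
 * just a one-ulp clamp. For example, min_lod = 0.0 packs as minimum_lod = 0
 * and maximum_lod = 1, i.e. LOD is pinned to [0, 1/256] and only the base
 * level is ever sampled. Hypothetical helper, illustrative only: */

static inline void
pan_example_mip_off_clamp(float min_lod, uint16_t *lo, uint16_t *hi)
{
        /* Presumed 8.8 fixed point: 1 ulp == 1/256 of a mip level */
        *lo = (uint16_t)(min_lod * 256.0f);
        *hi = *lo + 1;
}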
426
427 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
428 struct mali_bifrost_sampler_packed *hw)
429 {
430 pan_pack(hw, BIFROST_SAMPLER, cfg) {
431 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
432 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
433 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
434 cfg.normalized_coordinates = cso->normalized_coords;
435
436 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
437 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
438 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
439
440 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
441 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
442 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
443
444 cfg.compare_function = panfrost_sampler_compare_func(cso);
445 cfg.seamless_cube_map = cso->seamless_cube_map;
446 }
447 }
448
449 static void
450 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
451 struct mali_shader_meta *fragmeta)
452 {
453 if (!ctx->rasterizer) {
454 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
455 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
456 fragmeta->depth_units = 0.0f;
457 fragmeta->depth_factor = 0.0f;
458 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
459 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
460 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, true);
461 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, true);
462 return;
463 }
464
465 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
466
467 bool msaa = rast->multisample;
468
469 /* TODO: Sample size */
470 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
471 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
472
473 struct panfrost_shader_state *fs;
474 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
475
476 /* EXT_shader_framebuffer_fetch requires the shader to be run
477 * per-sample when outputs are read. */
478 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
479 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
480
481 fragmeta->depth_units = rast->offset_units * 2.0f;
482 fragmeta->depth_factor = rast->offset_scale;
483
484 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
485
486 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
487 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
488
489 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
490 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
491 }
492
493 static void
494 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
495 struct mali_shader_meta *fragmeta)
496 {
497 const struct panfrost_zsa_state *so = ctx->depth_stencil;
498 int zfunc = PIPE_FUNC_ALWAYS;
499
500 if (!so) {
501 /* If stenciling is disabled, the state is irrelevant */
502 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
503 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
504 } else {
505 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
506 so->base.stencil[0].enabled);
507
508 fragmeta->stencil_mask_front = so->stencil_mask_front;
509 fragmeta->stencil_mask_back = so->stencil_mask_back;
510
511 /* Bottom bits for stencil ref, exactly one word */
512 fragmeta->stencil_front.opaque[0] = so->stencil_front.opaque[0] | ctx->stencil_ref.ref_value[0];
513
514 /* If back-stencil is not enabled, use the front values */
515
516 if (so->base.stencil[1].enabled)
517 fragmeta->stencil_back.opaque[0] = so->stencil_back.opaque[0] | ctx->stencil_ref.ref_value[1];
518 else
519 fragmeta->stencil_back = fragmeta->stencil_front;
520
521 if (so->base.depth.enabled)
522 zfunc = so->base.depth.func;
523
524 /* Depth state (TODO: Refactor) */
525
526 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
527 so->base.depth.writemask);
528 }
529
530 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
531 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
532 }
533
534 static bool
535 panfrost_fs_required(
536 struct panfrost_shader_state *fs,
537 struct panfrost_blend_final *blend,
538 unsigned rt_count)
539 {
540 /* If we generally have side effects */
541 if (fs->fs_sidefx)
542 return true;
543
544 /* If colour is written we need to execute */
545 for (unsigned i = 0; i < rt_count; ++i) {
546 if (!blend[i].no_colour)
547 return true;
548 }
549
550 /* If depth is written and not implied we need to execute.
551 * TODO: Predicate on Z/S writes being enabled */
552 return (fs->writes_depth || fs->writes_stencil);
553 }
554
555 static void
556 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
557 struct mali_shader_meta *fragmeta,
558 void *rts)
559 {
560 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
561 const struct panfrost_device *dev = pan_device(ctx->base.screen);
562 struct panfrost_shader_state *fs;
563 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
564
565 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
566 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
567 !ctx->blend->base.dither);
568
569 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
570 ctx->blend->base.alpha_to_coverage);
571
572 /* Get blending setup */
573 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
574
575 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
576 unsigned shader_offset = 0;
577 struct panfrost_bo *shader_bo = NULL;
578
579 for (unsigned c = 0; c < rt_count; ++c)
580 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
581 &shader_offset);
582
583 /* Disable shader execution if we can */
584 if (dev->quirks & MIDGARD_SHADERLESS
585 && !panfrost_fs_required(fs, blend, rt_count)) {
586 fragmeta->shader = 0;
587 fragmeta->attribute_count = 0;
588 fragmeta->varying_count = 0;
589 fragmeta->texture_count = 0;
590 fragmeta->sampler_count = 0;
591
592 /* This feature is not known to work on Bifrost */
593 fragmeta->midgard1.work_count = 1;
594 fragmeta->midgard1.uniform_count = 0;
595 fragmeta->midgard1.uniform_buffer_count = 0;
596 }
597
598 /* If there is a blend shader, work registers are shared. We impose 8
599 * work registers as a limit for blend shaders. Should be lower XXX */
600
601 if (!(dev->quirks & IS_BIFROST)) {
602 for (unsigned c = 0; c < rt_count; ++c) {
603 if (blend[c].is_shader) {
604 fragmeta->midgard1.work_count =
605 MAX2(fragmeta->midgard1.work_count, 8);
606 }
607 }
608 }
609
610 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
611 * copied to the blend_meta appended (by convention), but this is the
612 * field actually read by the hardware. (Or maybe both are read...?).
613 * Specify the last RTi with a blend shader. */
614
615 fragmeta->blend.shader = 0;
616
617 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
618 if (!blend[rt].is_shader)
619 continue;
620
621 fragmeta->blend.shader = blend[rt].shader.gpu |
622 blend[rt].shader.first_tag;
623 break;
624 }
625
626 if (dev->quirks & MIDGARD_SFBD) {
627 /* On single render target (SFBD) platforms, the blend
628 * information is inside the shader meta itself. We additionally
629 * need to signal CAN_DISCARD for nontrivial blend modes (so
630 * we're able to read back the destination buffer) */
631
632 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
633 blend[0].is_shader);
634
635 if (!blend[0].is_shader) {
636 fragmeta->blend.equation = *blend[0].equation.equation;
637 fragmeta->blend.constant = blend[0].equation.constant;
638 }
639
640 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
641 !blend[0].no_blending || fs->can_discard);
642
643 batch->draws |= PIPE_CLEAR_COLOR0;
644 return;
645 }
646
647 if (dev->quirks & IS_BIFROST) {
648 bool no_blend = true;
649
650 for (unsigned i = 0; i < rt_count; ++i)
651 no_blend &= (blend[i].no_blending | blend[i].no_colour);
652
653 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
654 !fs->can_discard && !fs->writes_depth && no_blend);
655 }
656
657 /* Additional blend descriptor tacked on for jobs using MFBD */
658
659 for (unsigned i = 0; i < rt_count; ++i) {
660 unsigned flags = 0;
661
662 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
663 flags = 0x200;
664 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
665
666 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
667 (ctx->pipe_framebuffer.cbufs[i]) &&
668 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
669
670 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
671 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
672 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
673 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
674 }
675
676 if (dev->quirks & IS_BIFROST) {
677 struct bifrost_blend_rt *brts = rts;
678
679 brts[i].flags = flags;
680
681 if (blend[i].is_shader) {
682 /* The blend shader's address needs to be at
683 * the same top 32 bits as the fragment shader.
684 * TODO: Ensure that's always the case.
685 */
686 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
687 (fs->bo->gpu & (0xffffffffull << 32)));
688 brts[i].shader = blend[i].shader.gpu;
689 brts[i].unk2 = 0x0;
690 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
691 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
692 const struct util_format_description *format_desc;
693 format_desc = util_format_description(format);
694
695 brts[i].equation = *blend[i].equation.equation;
696
697 /* TODO: this is a bit more complicated */
698 brts[i].constant = blend[i].equation.constant;
699
700 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
701
702 /* 0x19 disables blending and forces REPLACE
703 * mode (equivalent to rgb_mode = alpha_mode =
704 * x122, colour mask = 0xF). 0x1a allows
705 * blending. */
706 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
707
708 brts[i].shader_type = fs->blend_types[i];
709 } else {
710 /* Dummy attachment for depth-only */
711 brts[i].unk2 = 0x3;
712 brts[i].shader_type = fs->blend_types[i];
713 }
714 } else {
715 struct midgard_blend_rt *mrts = rts;
716 mrts[i].flags = flags;
717
718 if (blend[i].is_shader) {
719 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
720 } else {
721 mrts[i].blend.equation = *blend[i].equation.equation;
722 mrts[i].blend.constant = blend[i].equation.constant;
723 }
724 }
725 }
726 }
727
728 static void
729 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
730 struct mali_shader_meta *fragmeta,
731 void *rts)
732 {
733 const struct panfrost_device *dev = pan_device(ctx->base.screen);
734 struct panfrost_shader_state *fs;
735
736 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
737
738 bool msaa = ctx->rasterizer && ctx->rasterizer->base.multisample;
739 fragmeta->coverage_mask = msaa ? ctx->sample_mask : ~0;
740
741 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
742 fragmeta->unknown2_4 = 0x4e0;
743
744 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
745 * is required (independent of 32-bit/64-bit descriptors), or why it's
746 * not used on later GPU revisions. Otherwise, all shader jobs fault on
747 * these earlier chips (perhaps this is a chicken bit of some kind).
748 * More investigation is needed. */
749
750 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
751
752 if (dev->quirks & IS_BIFROST) {
753 /* TODO */
754 } else {
755 /* Depending on whether it's legal in the given shader, we try to
756 * enable early-z testing. TODO: respect e-z force */
757
758 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
759 !fs->can_discard && !fs->writes_global &&
760 !fs->writes_depth && !fs->writes_stencil &&
761 !ctx->blend->base.alpha_to_coverage);
762
763 /* Add the writes Z/S flags if needed. */
764 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
765 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
766
767 /* Any time texturing is used, derivatives are implicitly calculated,
768 * so we need to enable helper invocations */
769
770 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
771 fs->helper_invocations);
772
773 /* If discard is enabled, which bit we set to convey this
774 * depends on if depth/stencil is used for the draw or not.
775 * Just one of depth OR stencil is enough to trigger this. */
776
777 const struct pipe_depth_stencil_alpha_state *zsa = &ctx->depth_stencil->base;
778 bool zs_enabled = fs->writes_depth || fs->writes_stencil;
779
780 if (zsa) {
781 zs_enabled |= (zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
782 zs_enabled |= zsa->stencil[0].enabled;
783 }
784
785 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
786 fs->outputs_read || (!zs_enabled && fs->can_discard));
787 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, zs_enabled && fs->can_discard);
788 }
789
790 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
791 panfrost_frag_meta_zsa_update(ctx, fragmeta);
792 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
793 }
794
795 void
796 panfrost_emit_shader_meta(struct panfrost_batch *batch,
797 enum pipe_shader_type st,
798 struct mali_vertex_tiler_postfix *postfix)
799 {
800 struct panfrost_context *ctx = batch->ctx;
801 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
802
803 if (!ss) {
804 postfix->shader = 0;
805 return;
806 }
807
808 struct mali_shader_meta meta;
809
810 panfrost_shader_meta_init(ctx, st, &meta);
811
812 /* Add the shader BO to the batch. */
813 panfrost_batch_add_bo(batch, ss->bo,
814 PAN_BO_ACCESS_PRIVATE |
815 PAN_BO_ACCESS_READ |
816 panfrost_bo_access_for_stage(st));
817
818 mali_ptr shader_ptr;
819
820 if (st == PIPE_SHADER_FRAGMENT) {
821 struct panfrost_device *dev = pan_device(ctx->base.screen);
822 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
823 size_t desc_size = sizeof(meta);
824 void *rts = NULL;
825 struct panfrost_transfer xfer;
826 unsigned rt_size;
827
828 if (dev->quirks & MIDGARD_SFBD)
829 rt_size = 0;
830 else if (dev->quirks & IS_BIFROST)
831 rt_size = sizeof(struct bifrost_blend_rt);
832 else
833 rt_size = sizeof(struct midgard_blend_rt);
834
835 desc_size += rt_size * rt_count;
836
837 if (rt_size)
838 rts = rzalloc_size(ctx, rt_size * rt_count);
839
840 panfrost_frag_shader_meta_init(ctx, &meta, rts);
841
842 xfer = panfrost_pool_alloc(&batch->pool, desc_size);
843
844 memcpy(xfer.cpu, &meta, sizeof(meta));
845 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
846
847 if (rt_size)
848 ralloc_free(rts);
849
850 shader_ptr = xfer.gpu;
851 } else {
852 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
853 sizeof(meta));
854 }
855
856 postfix->shader = shader_ptr;
857 }
858
859 void
860 panfrost_emit_viewport(struct panfrost_batch *batch,
861 struct mali_vertex_tiler_postfix *tiler_postfix)
862 {
863 struct panfrost_context *ctx = batch->ctx;
864 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
865 const struct pipe_scissor_state *ss = &ctx->scissor;
866 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
867 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
868
869 /* Derive min/max from translate/scale. Note since |x| >= 0 by
870 * definition, we have that -|x| <= |x| hence translate - |scale| <=
871 * translate + |scale|, so the ordering is correct here. */
872 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
873 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
874 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
875 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
876 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
877 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
878
879 /* Scissor to the intersection of the viewport and the scissor, clamped
880 * to the framebuffer */
881
882 unsigned minx = MIN2(fb->width, vp_minx);
883 unsigned maxx = MIN2(fb->width, vp_maxx);
884 unsigned miny = MIN2(fb->height, vp_miny);
885 unsigned maxy = MIN2(fb->height, vp_maxy);
886
887 if (ss && rast && rast->scissor) {
888 minx = MAX2(ss->minx, minx);
889 miny = MAX2(ss->miny, miny);
890 maxx = MIN2(ss->maxx, maxx);
891 maxy = MIN2(ss->maxy, maxy);
892 }
893
894 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
895
896 pan_pack(T.cpu, VIEWPORT, cfg) {
897 cfg.scissor_minimum_x = minx;
898 cfg.scissor_minimum_y = miny;
899 cfg.scissor_maximum_x = maxx - 1;
900 cfg.scissor_maximum_y = maxy - 1;
901
902 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
903 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
904 }
905
906 tiler_postfix->viewport = T.gpu;
907 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
908 }
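
/* Editorial sketch, not part of the original file: with Gallium's viewport
 * transform (screen = ndc * scale + translate), a full-framebuffer viewport
 * for a 1920x1080 target typically has scale = (960, 540, 0.5) and
 * translate = (960, 540, 0.5), so the math above yields minx/maxx = 0/1920,
 * miny/maxy = 0/1080, minz/maxz = 0/1, and the packed scissor covers
 * [0, 1919] x [0, 1079] since maximum_x/y are inclusive. Illustrative only: */

static inline void
pan_example_viewport_bounds(const float translate[3], const float scale[3],
                            float *minv, float *maxv)
{
        for (unsigned i = 0; i < 3; ++i) {
                minv[i] = translate[i] - fabsf(scale[i]);
                maxv[i] = translate[i] + fabsf(scale[i]);
        }
}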
909
910 static mali_ptr
911 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
912 enum pipe_shader_type st,
913 struct panfrost_constant_buffer *buf,
914 unsigned index)
915 {
916 struct pipe_constant_buffer *cb = &buf->cb[index];
917 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
918
919 if (rsrc) {
920 panfrost_batch_add_bo(batch, rsrc->bo,
921 PAN_BO_ACCESS_SHARED |
922 PAN_BO_ACCESS_READ |
923 panfrost_bo_access_for_stage(st));
924
925 /* Alignment guaranteed by
926 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
927 return rsrc->bo->gpu + cb->buffer_offset;
928 } else if (cb->user_buffer) {
929 return panfrost_pool_upload(&batch->pool,
930 cb->user_buffer +
931 cb->buffer_offset,
932 cb->buffer_size);
933 } else {
934 unreachable("No constant buffer");
935 }
936 }
937
938 struct sysval_uniform {
939 union {
940 float f[4];
941 int32_t i[4];
942 uint32_t u[4];
943 uint64_t du[2];
944 };
945 };
946
947 static void
948 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
949 struct sysval_uniform *uniform)
950 {
951 struct panfrost_context *ctx = batch->ctx;
952 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
953
954 uniform->f[0] = vp->scale[0];
955 uniform->f[1] = vp->scale[1];
956 uniform->f[2] = vp->scale[2];
957 }
958
959 static void
960 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
961 struct sysval_uniform *uniform)
962 {
963 struct panfrost_context *ctx = batch->ctx;
964 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
965
966 uniform->f[0] = vp->translate[0];
967 uniform->f[1] = vp->translate[1];
968 uniform->f[2] = vp->translate[2];
969 }
970
971 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
972 enum pipe_shader_type st,
973 unsigned int sysvalid,
974 struct sysval_uniform *uniform)
975 {
976 struct panfrost_context *ctx = batch->ctx;
977 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
978 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
979 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
980 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
981
982 assert(dim);
983 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
984
985 if (dim > 1)
986 uniform->i[1] = u_minify(tex->texture->height0,
987 tex->u.tex.first_level);
988
989 if (dim > 2)
990 uniform->i[2] = u_minify(tex->texture->depth0,
991 tex->u.tex.first_level);
992
993 if (is_array)
994 uniform->i[dim] = tex->texture->array_size;
995 }
996
997 static void
998 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
999 enum pipe_shader_type st,
1000 unsigned ssbo_id,
1001 struct sysval_uniform *uniform)
1002 {
1003 struct panfrost_context *ctx = batch->ctx;
1004
1005 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1006 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1007
1008 /* Compute address */
1009 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1010
1011 panfrost_batch_add_bo(batch, bo,
1012 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1013 panfrost_bo_access_for_stage(st));
1014
1015 /* Upload address and size as sysval */
1016 uniform->du[0] = bo->gpu + sb.buffer_offset;
1017 uniform->u[2] = sb.buffer_size;
1018 }
1019
1020 static void
1021 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1022 enum pipe_shader_type st,
1023 unsigned samp_idx,
1024 struct sysval_uniform *uniform)
1025 {
1026 struct panfrost_context *ctx = batch->ctx;
1027 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1028
1029 uniform->f[0] = sampl->min_lod;
1030 uniform->f[1] = sampl->max_lod;
1031 uniform->f[2] = sampl->lod_bias;
1032
1033 /* Even without any errata, Midgard represents "no mipmapping" as
1034 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1035 * panfrost_create_sampler_state which also explains our choice of
1036 * epsilon value (again to keep behaviour consistent) */
1037
1038 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1039 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1040 }
1041
1042 static void
1043 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1044 struct sysval_uniform *uniform)
1045 {
1046 struct panfrost_context *ctx = batch->ctx;
1047
1048 uniform->u[0] = ctx->compute_grid->grid[0];
1049 uniform->u[1] = ctx->compute_grid->grid[1];
1050 uniform->u[2] = ctx->compute_grid->grid[2];
1051 }
1052
1053 static void
1054 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1055 struct panfrost_shader_state *ss,
1056 enum pipe_shader_type st)
1057 {
1058 struct sysval_uniform *uniforms = (void *)buf;
1059
1060 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1061 int sysval = ss->sysval[i];
1062
1063 switch (PAN_SYSVAL_TYPE(sysval)) {
1064 case PAN_SYSVAL_VIEWPORT_SCALE:
1065 panfrost_upload_viewport_scale_sysval(batch,
1066 &uniforms[i]);
1067 break;
1068 case PAN_SYSVAL_VIEWPORT_OFFSET:
1069 panfrost_upload_viewport_offset_sysval(batch,
1070 &uniforms[i]);
1071 break;
1072 case PAN_SYSVAL_TEXTURE_SIZE:
1073 panfrost_upload_txs_sysval(batch, st,
1074 PAN_SYSVAL_ID(sysval),
1075 &uniforms[i]);
1076 break;
1077 case PAN_SYSVAL_SSBO:
1078 panfrost_upload_ssbo_sysval(batch, st,
1079 PAN_SYSVAL_ID(sysval),
1080 &uniforms[i]);
1081 break;
1082 case PAN_SYSVAL_NUM_WORK_GROUPS:
1083 panfrost_upload_num_work_groups_sysval(batch,
1084 &uniforms[i]);
1085 break;
1086 case PAN_SYSVAL_SAMPLER:
1087 panfrost_upload_sampler_sysval(batch, st,
1088 PAN_SYSVAL_ID(sysval),
1089 &uniforms[i]);
1090 break;
1091 default:
1092 assert(0);
1093 }
1094 }
1095 }
1096
1097 static const void *
1098 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1099 unsigned index)
1100 {
1101 struct pipe_constant_buffer *cb = &buf->cb[index];
1102 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1103
1104 if (rsrc)
1105 return rsrc->bo->cpu;
1106 else if (cb->user_buffer)
1107 return cb->user_buffer;
1108 else
1109 unreachable("No constant buffer");
1110 }
1111
1112 void
1113 panfrost_emit_const_buf(struct panfrost_batch *batch,
1114 enum pipe_shader_type stage,
1115 struct mali_vertex_tiler_postfix *postfix)
1116 {
1117 struct panfrost_context *ctx = batch->ctx;
1118 struct panfrost_shader_variants *all = ctx->shader[stage];
1119
1120 if (!all)
1121 return;
1122
1123 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1124
1125 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1126
1127 /* Uniforms are implicitly UBO #0 */
1128 bool has_uniforms = buf->enabled_mask & (1 << 0);
1129
1130 /* Allocate room for the sysval and the uniforms */
1131 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1132 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1133 size_t size = sys_size + uniform_size;
1134 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1135 size);
1136
1137 /* Upload sysvals requested by the shader */
1138 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1139
1140 /* Upload uniforms */
1141 if (has_uniforms && uniform_size) {
1142 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1143 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1144 }
1145
1146 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1147 * uploaded */
1148
1149 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1150 assert(ubo_count >= 1);
1151
1152 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1153 struct panfrost_transfer ubos = panfrost_pool_alloc(&batch->pool, sz);
1154 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1155
1156 /* Upload uniforms as a UBO */
1157
1158 if (ss->uniform_count) {
1159 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1160 cfg.entries = ss->uniform_count;
1161 cfg.pointer = transfer.gpu;
1162 }
1163 } else {
1164 *ubo_ptr = 0;
1165 }
1166
1167 /* The rest are honest-to-goodness UBOs */
1168
1169 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1170 size_t usz = buf->cb[ubo].buffer_size;
1171 bool enabled = buf->enabled_mask & (1 << ubo);
1172 bool empty = usz == 0;
1173
1174 if (!enabled || empty) {
1175 ubo_ptr[ubo] = 0;
1176 continue;
1177 }
1178
1179 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1180 cfg.entries = DIV_ROUND_UP(usz, 16);
1181 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1182 stage, buf, ubo);
1183 }
1184 }
1185
1186 postfix->uniforms = transfer.gpu;
1187 postfix->uniform_buffers = ubos.gpu;
1188
1189 buf->dirty_mask = 0;
1190 }
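
/* Editorial sketch, not part of the original file: the transient upload built
 * above is laid out as [sysvals][uniforms], one 16-byte vec4 slot per sysval.
 * For instance, a shader with 3 sysvals and a 64-byte UBO 0 produces a
 * 112-byte upload with the sysvals at bytes [0, 48) and the user uniforms at
 * [48, 112). Hypothetical helper for the uniform offset: */

static inline size_t
pan_example_uniform_offset(unsigned sysval_count)
{
        /* Uniforms follow the sysvals, which occupy a vec4 (16 bytes) each */
        return sizeof(float) * 4 * sysval_count;
}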
1191
1192 void
1193 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1194 const struct pipe_grid_info *info,
1195 struct midgard_payload_vertex_tiler *vtp)
1196 {
1197 struct panfrost_context *ctx = batch->ctx;
1198 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1199 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1200 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1201 128));
1202 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1203 info->grid[2] * 4;
1204 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1205 shared_size,
1206 1);
1207
1208 struct mali_shared_memory shared = {
1209 .shared_memory = bo->gpu,
1210 .shared_workgroup_count =
1211 util_logbase2_ceil(info->grid[0]) +
1212 util_logbase2_ceil(info->grid[1]) +
1213 util_logbase2_ceil(info->grid[2]),
1214 .shared_unk1 = 0x2,
1215 .shared_shift = util_logbase2(single_size) - 1
1216 };
1217
1218 vtp->postfix.shared_memory = panfrost_pool_upload(&batch->pool, &shared,
1219 sizeof(shared));
1220 }
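
/* Editorial note, not part of the original file: a worked example of the
 * sizing above. For a (4, 4, 1) grid with ss->shared_size = 200 bytes,
 * single_size rounds up to 256, the allocation is 256 * 4 * 4 * 1 * 4 =
 * 16384 bytes, shared_shift = log2(256) - 1 = 7, and shared_workgroup_count
 * = ceil(log2(4)) + ceil(log2(4)) + ceil(log2(1)) = 2 + 2 + 0 = 4. */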
1221
1222 static mali_ptr
1223 panfrost_get_tex_desc(struct panfrost_batch *batch,
1224 enum pipe_shader_type st,
1225 struct panfrost_sampler_view *view)
1226 {
1227 if (!view)
1228 return (mali_ptr) 0;
1229
1230 struct pipe_sampler_view *pview = &view->base;
1231 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1232
1233 /* Add the BO to the job so it's retained until the job is done. */
1234
1235 panfrost_batch_add_bo(batch, rsrc->bo,
1236 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1237 panfrost_bo_access_for_stage(st));
1238
1239 panfrost_batch_add_bo(batch, view->bo,
1240 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1241 panfrost_bo_access_for_stage(st));
1242
1243 return view->bo->gpu;
1244 }
1245
1246 static void
1247 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1248 struct pipe_context *pctx)
1249 {
1250 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1251 if (view->texture_bo != rsrc->bo->gpu ||
1252 view->modifier != rsrc->modifier) {
1253 panfrost_bo_unreference(view->bo);
1254 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1255 }
1256 }
1257
1258 void
1259 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1260 enum pipe_shader_type stage,
1261 struct mali_vertex_tiler_postfix *postfix)
1262 {
1263 struct panfrost_context *ctx = batch->ctx;
1264 struct panfrost_device *device = pan_device(ctx->base.screen);
1265
1266 if (!ctx->sampler_view_count[stage])
1267 return;
1268
1269 if (device->quirks & IS_BIFROST) {
1270 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
1271 MALI_BIFROST_TEXTURE_LENGTH *
1272 ctx->sampler_view_count[stage]);
1273
1274 struct mali_bifrost_texture_packed *out =
1275 (struct mali_bifrost_texture_packed *) T.cpu;
1276
1277 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1278 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1279 struct pipe_sampler_view *pview = &view->base;
1280 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1281
1282 panfrost_update_sampler_view(view, &ctx->base);
1283 out[i] = view->bifrost_descriptor;
1284
1285 /* Add the BOs to the job so they are retained until the job is done. */
1286
1287 panfrost_batch_add_bo(batch, rsrc->bo,
1288 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1289 panfrost_bo_access_for_stage(stage));
1290
1291 panfrost_batch_add_bo(batch, view->bo,
1292 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1293 panfrost_bo_access_for_stage(stage));
1294 }
1295
1296 postfix->textures = T.gpu;
1297 } else {
1298 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1299
1300 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1301 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1302
1303 panfrost_update_sampler_view(view, &ctx->base);
1304
1305 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1306 }
1307
1308 postfix->textures = panfrost_pool_upload(&batch->pool,
1309 trampolines,
1310 sizeof(uint64_t) *
1311 ctx->sampler_view_count[stage]);
1312 }
1313 }
1314
1315 void
1316 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1317 enum pipe_shader_type stage,
1318 struct mali_vertex_tiler_postfix *postfix)
1319 {
1320 struct panfrost_context *ctx = batch->ctx;
1321
1322 if (!ctx->sampler_count[stage])
1323 return;
1324
1325 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1326 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1327
1328 size_t sz = desc_size * ctx->sampler_count[stage];
1329 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, sz);
1330 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1331
1332 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1333 out[i] = ctx->samplers[stage][i]->hw;
1334
1335 postfix->sampler_descriptor = T.gpu;
1336 }
1337
1338 void
1339 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1340 struct mali_vertex_tiler_postfix *vertex_postfix)
1341 {
1342 struct panfrost_context *ctx = batch->ctx;
1343 struct panfrost_vertex_state *so = ctx->vertex;
1344
1345 unsigned instance_shift = vertex_postfix->instance_shift;
1346 unsigned instance_odd = vertex_postfix->instance_odd;
1347
1348 /* Worst case: everything is NPOT */
1349
1350 struct panfrost_transfer S = panfrost_pool_alloc(&batch->pool,
1351 MALI_ATTRIBUTE_LENGTH * PIPE_MAX_ATTRIBS * 2);
1352
1353 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
1354 MALI_ATTRIBUTE_LENGTH * (PAN_INSTANCE_ID + 1));
1355
1356 struct mali_attribute_buffer_packed *bufs =
1357 (struct mali_attribute_buffer_packed *) S.cpu;
1358
1359 struct mali_attribute_packed *out =
1360 (struct mali_attribute_packed *) T.cpu;
1361
1362 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1363 unsigned k = 0;
1364
1365 for (unsigned i = 0; i < so->num_elements; ++i) {
1366 /* We map buffers 1:1 with the attributes, which
1367 * means duplicating some vertex buffers (who cares? aside from
1368 * maybe some caching implications but I somehow doubt that
1369 * matters) */
1370
1371 struct pipe_vertex_element *elem = &so->pipe[i];
1372 unsigned vbi = elem->vertex_buffer_index;
1373 attrib_to_buffer[i] = k;
1374
1375 if (!(ctx->vb_mask & (1 << vbi)))
1376 continue;
1377
1378 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1379 struct panfrost_resource *rsrc;
1380
1381 rsrc = pan_resource(buf->buffer.resource);
1382 if (!rsrc)
1383 continue;
1384
1385 /* Add a dependency of the batch on the vertex buffer */
1386 panfrost_batch_add_bo(batch, rsrc->bo,
1387 PAN_BO_ACCESS_SHARED |
1388 PAN_BO_ACCESS_READ |
1389 PAN_BO_ACCESS_VERTEX_TILER);
1390
1391 /* Mask off lower bits, see offset fixup below */
1392 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1393 mali_ptr addr = raw_addr & ~63;
1394
1395 /* Since we advanced the base pointer, we shrink the buffer
1396 * size, but add the offset we subtracted */
1397 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1398 - buf->buffer_offset;
1399
1400 /* When there is a divisor, the hardware-level divisor is
1401 * the product of the instance divisor and the padded count */
1402 unsigned divisor = elem->instance_divisor;
1403 unsigned hw_divisor = ctx->padded_count * divisor;
1404 unsigned stride = buf->stride;
1405
1406 /* If there's a divisor(=1) but no instancing, we want every
1407 * attribute to be the same */
1408
1409 if (divisor && ctx->instance_count == 1)
1410 stride = 0;
1411
1412 if (!divisor || ctx->instance_count <= 1) {
1413 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1414 if (ctx->instance_count > 1)
1415 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1416
1417 cfg.pointer = addr;
1418 cfg.stride = stride;
1419 cfg.size = size;
1420 cfg.divisor_r = instance_shift;
1421 cfg.divisor_p = instance_odd;
1422 }
1423 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1424 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1425 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1426 cfg.pointer = addr;
1427 cfg.stride = stride;
1428 cfg.size = size;
1429 cfg.divisor_r = __builtin_ctz(hw_divisor);
1430 }
1431
1432 } else {
1433 unsigned shift = 0, extra_flags = 0;
1434
1435 unsigned magic_divisor =
1436 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
1437
1438 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1439 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1440 cfg.pointer = addr;
1441 cfg.stride = stride;
1442 cfg.size = size;
1443
1444 cfg.divisor_r = shift;
1445 cfg.divisor_e = extra_flags;
1446 }
1447
1448 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1449 cfg.divisor_numerator = magic_divisor;
1450 cfg.divisor = divisor;
1451 }
1452
1453 ++k;
1454 }
1455
1456 ++k;
1457 }
1458
1459 /* Add special gl_VertexID/gl_InstanceID buffers */
1460
1461 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1462
1463 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1464 cfg.buffer_index = k++;
1465 cfg.format = so->formats[PAN_VERTEX_ID];
1466 }
1467
1468 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1469
1470 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1471 cfg.buffer_index = k++;
1472 cfg.format = so->formats[PAN_INSTANCE_ID];
1473 }
1474
1475 /* Attribute addresses require 64-byte alignment, so let:
1476 *
1477 * base' = base & ~63 = base - (base & 63)
1478 * offset' = offset + (base & 63)
1479 *
1480 * Since base' + offset' = base + offset, these are equivalent
1481 * addressing modes and now base is 64 aligned.
1482 */
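
/* Editorial note, not part of the original file: for example, a BO mapped at
 * 0x1000000 (4k aligned, hence 64-byte aligned) with buffer_offset = 100
 * gives base = 0x1000064, base & 63 = 36, base' = 0x1000040, and
 * offset' = src_offset + 36; the sum is unchanged, but the pointer written to
 * the attribute buffer descriptor is now 64-byte aligned. */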
1483
1484 unsigned start = vertex_postfix->offset_start;
1485
1486 for (unsigned i = 0; i < so->num_elements; ++i) {
1487 unsigned vbi = so->pipe[i].vertex_buffer_index;
1488 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1489
1490 /* Adjust by the masked off bits of the offset. Make sure we
1491 * read src_offset from so->hw (which is not GPU visible)
1492 * rather than target (which is) due to caching effects */
1493
1494 unsigned src_offset = so->pipe[i].src_offset;
1495
1496 /* BOs aligned to 4k so guaranteed aligned to 64 */
1497 src_offset += (buf->buffer_offset & 63);
1498
1499 /* Also, somewhat obscurely, per-instance data needs to be
1500 * offset in response to a delayed start in an indexed draw */
1501
1502 if (so->pipe[i].instance_divisor && ctx->instance_count > 1 && start)
1503 src_offset -= buf->stride * start;
1504
1505 pan_pack(out + i, ATTRIBUTE, cfg) {
1506 cfg.buffer_index = attrib_to_buffer[i];
1507 cfg.format = so->formats[i];
1508 cfg.offset = src_offset;
1509 }
1510 }
1511
1512 vertex_postfix->attributes = S.gpu;
1513 vertex_postfix->attribute_meta = T.gpu;
1514 }
1515
1516 static mali_ptr
1517 panfrost_emit_varyings(struct panfrost_batch *batch,
1518 struct mali_attribute_buffer_packed *slot,
1519 unsigned stride, unsigned count)
1520 {
1521 unsigned size = stride * count;
1522 mali_ptr ptr = panfrost_pool_alloc(&batch->pool, size).gpu;
1523
1524 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1525 cfg.stride = stride;
1526 cfg.size = size;
1527 cfg.pointer = ptr;
1528 }
1529
1530 return ptr;
1531 }
1532
1533 static unsigned
1534 panfrost_streamout_offset(unsigned stride, unsigned offset,
1535 struct pipe_stream_output_target *target)
1536 {
1537 return (target->buffer_offset + (offset * stride * 4)) & 63;
1538 }
1539
1540 static void
1541 panfrost_emit_streamout(struct panfrost_batch *batch,
1542 struct mali_attribute_buffer_packed *slot,
1543 unsigned stride_words, unsigned offset, unsigned count,
1544 struct pipe_stream_output_target *target)
1545 {
1546 unsigned stride = stride_words * 4;
1547 unsigned max_size = target->buffer_size;
1548 unsigned expected_size = stride * count;
1549
1550 /* Grab the BO and bind it to the batch */
1551 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1552
1553 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1554 * the perspective of the TILER and FRAGMENT.
1555 */
1556 panfrost_batch_add_bo(batch, bo,
1557 PAN_BO_ACCESS_SHARED |
1558 PAN_BO_ACCESS_RW |
1559 PAN_BO_ACCESS_VERTEX_TILER |
1560 PAN_BO_ACCESS_FRAGMENT);
1561
1562 /* We will have an offset applied to get alignment */
1563 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1564
1565 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1566 cfg.pointer = (addr & ~63);
1567 cfg.stride = stride;
1568 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1569 }
1570 }
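
/* Editorial note, not part of the original file: this is the same 64-byte
 * alignment trick used for vertex attribute buffers. E.g. with
 * buffer_offset = 100, offset = 3 and stride_words = 4 (a 16-byte stride),
 * the raw address is bo->gpu + 148; the descriptor stores (addr & ~63) and
 * pads the size by addr & 63 = 20 bytes, while panfrost_streamout_offset()
 * above returns the same remainder (20), presumably consumed as the matching
 * record-level offset. */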
1571
1572 static bool
1573 has_point_coord(unsigned mask, gl_varying_slot loc)
1574 {
1575 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1576 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1577 else if (loc == VARYING_SLOT_PNTC)
1578 return (mask & (1 << 8));
1579 else
1580 return false;
1581 }
1582
1583 /* Helpers for manipulating stream out information so we can pack varyings
1584 * accordingly. Compute the src_offset for a given captured varying */
1585
1586 static struct pipe_stream_output *
1587 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1588 {
1589 for (unsigned i = 0; i < info->num_outputs; ++i) {
1590 if (info->output[i].register_index == loc)
1591 return &info->output[i];
1592 }
1593
1594 unreachable("Varying not captured");
1595 }
1596
1597 static unsigned
1598 pan_varying_size(enum mali_format fmt)
1599 {
1600 unsigned type = MALI_EXTRACT_TYPE(fmt);
1601 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1602 unsigned bits = MALI_EXTRACT_BITS(fmt);
1603 unsigned bpc = 0;
1604
1605 if (bits == MALI_CHANNEL_FLOAT) {
1606 /* No doubles */
1607 bool fp16 = (type == MALI_FORMAT_SINT);
1608 assert(fp16 || (type == MALI_FORMAT_UNORM));
1609
1610 bpc = fp16 ? 2 : 4;
1611 } else {
1612 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1613
1614 /* See the enums */
1615 bits = 1 << bits;
1616 assert(bits >= 8);
1617 bpc = bits / 8;
1618 }
1619
1620 return bpc * chan;
1621 }
1622
1623 /* Indices for named (non-XFB) varyings that are present. These are packed
1624 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1625 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1626 * of a given special field given a shift S by:
1627 *
1628 * idx = popcount(P & ((1 << S) - 1))
1629 *
1630 * That is... look at all of the varyings that come earlier and count them; the
1631 * count is the new index. Likewise, the total number of special
1632 * buffers required is simply popcount(P)
1633 */
1634
1635 enum pan_special_varying {
1636 PAN_VARY_GENERAL = 0,
1637 PAN_VARY_POSITION = 1,
1638 PAN_VARY_PSIZ = 2,
1639 PAN_VARY_PNTCOORD = 3,
1640 PAN_VARY_FACE = 4,
1641 PAN_VARY_FRAGCOORD = 5,
1642
1643 /* Keep last */
1644 PAN_VARY_MAX,
1645 };
1646
1647 /* Given a varying, figure out which index it corresponds to */
1648
1649 static inline unsigned
1650 pan_varying_index(unsigned present, enum pan_special_varying v)
1651 {
1652 unsigned mask = (1 << v) - 1;
1653 return util_bitcount(present & mask);
1654 }
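
/* Editorial sketch, not part of the original file: with
 * present = GENERAL | POSITION | PSIZ | PNTCOORD = 0b1111, PNTCOORD (bit 3)
 * lands at index popcount(0b1111 & 0b0111) = 3; drop PSIZ (present = 0b1011)
 * and PNTCOORD compacts down to popcount(0b1011 & 0b0111) = 2. A minimal
 * illustrative check (hypothetical, not used by the driver): */

static inline bool
pan_example_pntcoord_index_is_2(void)
{
        unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION) |
                           (1 << PAN_VARY_PNTCOORD);

        /* With PSIZ absent, PNTCOORD compacts down to buffer index 2 */
        return pan_varying_index(present, PAN_VARY_PNTCOORD) == 2;
}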
1655
1656 /* Get the base offset for XFB buffers, which by convention come after
1657 * everything else. Wrapper function for semantic reasons; by construction this
1658 * is just popcount. */
1659
1660 static inline unsigned
1661 pan_xfb_base(unsigned present)
1662 {
1663 return util_bitcount(present);
1664 }
1665
1666 /* Computes the present mask for varyings so we can start emitting varying records */
1667
1668 static inline unsigned
1669 pan_varying_present(
1670 struct panfrost_shader_state *vs,
1671 struct panfrost_shader_state *fs,
1672 unsigned quirks)
1673 {
1674 /* At the moment we always emit general and position buffers. Not
1675 * strictly necessary but usually harmless */
1676
1677 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1678
1679 /* Enable special buffers by the shader info */
1680
1681 if (vs->writes_point_size)
1682 present |= (1 << PAN_VARY_PSIZ);
1683
1684 if (fs->reads_point_coord)
1685 present |= (1 << PAN_VARY_PNTCOORD);
1686
1687 if (fs->reads_face)
1688 present |= (1 << PAN_VARY_FACE);
1689
1690 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1691 present |= (1 << PAN_VARY_FRAGCOORD);
1692
1693 /* Also, if we have a point sprite, we need a point coord buffer */
1694
1695 for (unsigned i = 0; i < fs->varying_count; i++) {
1696 gl_varying_slot loc = fs->varyings_loc[i];
1697
1698 if (has_point_coord(fs->point_sprite_mask, loc))
1699 present |= (1 << PAN_VARY_PNTCOORD);
1700 }
1701
1702 return present;
1703 }
1704
1705 /* Emitters for varying records */
1706
1707 static void
1708 pan_emit_vary(struct mali_attribute_packed *out,
1709 unsigned present, enum pan_special_varying buf,
1710 unsigned quirks, enum mali_format format,
1711 unsigned offset)
1712 {
1713 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1714 unsigned swizzle = quirks & HAS_SWIZZLES ?
1715 panfrost_get_default_swizzle(nr_channels) :
1716 panfrost_bifrost_swizzle(nr_channels);
1717
1718 pan_pack(out, ATTRIBUTE, cfg) {
1719 cfg.buffer_index = pan_varying_index(present, buf);
1720 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1721 cfg.format = (format << 12) | swizzle;
1722 cfg.offset = offset;
1723 }
1724 }
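
/* Usage sketch (illustrative): the special-varying emitters below reduce to
 * calls of the form
 *
 *    pan_emit_vary(out, present, PAN_VARY_POSITION, quirks, MALI_VARYING_POS, 0);
 *
 * i.e. point the record at the buffer slot computed from the present mask,
 * with the format enum packed above the 12-bit swizzle field. */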
1725
1726 /* General varying that is unused */
1727
1728 static void
1729 pan_emit_vary_only(struct mali_attribute_packed *out,
1730 unsigned present, unsigned quirks)
1731 {
1732         pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, MALI_VARYING_DISCARD, 0);
1733 }
1734
1735 /* Special records */
1736
1737 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1738 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1739 [PAN_VARY_PSIZ] = MALI_R16F,
1740 [PAN_VARY_PNTCOORD] = MALI_R16F,
1741 [PAN_VARY_FACE] = MALI_R32I,
1742 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1743 };
1744
1745 static void
1746 pan_emit_vary_special(struct mali_attribute_packed *out,
1747 unsigned present, enum pan_special_varying buf,
1748 unsigned quirks)
1749 {
1750 assert(buf < PAN_VARY_MAX);
1751 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1752 }
1753
1754 static enum mali_format
1755 pan_xfb_format(enum mali_format format, unsigned nr)
1756 {
1757 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1758 return MALI_R32F | MALI_NR_CHANNELS(nr);
1759 else
1760 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1761 }
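
/* Example (illustrative): a half-float two-component varying captured with
 * num_components = 2 is promoted to MALI_R32F | MALI_NR_CHANNELS(2), i.e. two
 * full fp32 channels, since transform feedback always writes 32-bit channels;
 * non-float formats keep their type but are widened via MALI_CHANNEL_32. */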
1762
1763 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1764 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1765 * value. */
1766
1767 static void
1768 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1769 unsigned present,
1770 unsigned max_xfb,
1771 unsigned *streamout_offsets,
1772 unsigned quirks,
1773 enum mali_format format,
1774 struct pipe_stream_output o)
1775 {
1776 unsigned swizzle = quirks & HAS_SWIZZLES ?
1777 panfrost_get_default_swizzle(o.num_components) :
1778 panfrost_bifrost_swizzle(o.num_components);
1779
1780 pan_pack(out, ATTRIBUTE, cfg) {
1781 /* XFB buffers come after everything else */
1782 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1783 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1784
1785 /* Override number of channels and precision to highp */
1786 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1787
1788                 /* Add the dword destination offset to the streamout buffer's byte offset */
1789 cfg.offset = (o.dst_offset * 4) /* dwords */
1790 + streamout_offsets[o.output_buffer];
1791 }
1792 }
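
/* Offset arithmetic example (numbers illustrative): an output with
 * dst_offset = 3 dwords, landing in a streamout buffer whose accumulated
 * offset is 256 bytes, gets cfg.offset = 3 * 4 + 256 = 268 bytes. */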
1793
1794 /* Determine if we should capture a varying for XFB. This requires actually
1795  * having a buffer for it. If we don't capture it, we'll fall back to a general
1796 * varying path (linked or unlinked, possibly discarding the write) */
1797
1798 static bool
1799 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1800 unsigned loc, unsigned max_xfb)
1801 {
1802         if (!(xfb->so_mask & (1ull << loc)))
1803 return false;
1804
1805 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1806 return o->output_buffer < max_xfb;
1807 }
1808
1809 static void
1810 pan_emit_general_varying(struct mali_attribute_packed *out,
1811 struct panfrost_shader_state *other,
1812 struct panfrost_shader_state *xfb,
1813 gl_varying_slot loc,
1814 enum mali_format format,
1815 unsigned present,
1816 unsigned quirks,
1817 unsigned *gen_offsets,
1818 enum mali_format *gen_formats,
1819 unsigned *gen_stride,
1820 unsigned idx,
1821 bool should_alloc)
1822 {
1823 /* Check if we're linked */
1824 signed other_idx = -1;
1825
1826 for (unsigned j = 0; j < other->varying_count; ++j) {
1827 if (other->varyings_loc[j] == loc) {
1828 other_idx = j;
1829 break;
1830 }
1831 }
1832
1833 if (other_idx < 0) {
1834 pan_emit_vary_only(out, present, quirks);
1835 return;
1836 }
1837
1838 unsigned offset = gen_offsets[other_idx];
1839
1840 if (should_alloc) {
1841                 /* We're linked, so allocate space via a watermark allocation */
1842 enum mali_format alt = other->varyings[other_idx];
1843
1844 /* Do interpolation at minimum precision */
1845 unsigned size_main = pan_varying_size(format);
1846 unsigned size_alt = pan_varying_size(alt);
1847 unsigned size = MIN2(size_main, size_alt);
1848
1849 /* If a varying is marked for XFB but not actually captured, we
1850 * should match the format to the format that would otherwise
1851 * be used for XFB, since dEQP checks for invariance here. It's
1852 * unclear if this is required by the spec. */
1853
1854 if (xfb->so_mask & (1ull << loc)) {
1855 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1856 format = pan_xfb_format(format, o->num_components);
1857 size = pan_varying_size(format);
1858 } else if (size == size_alt) {
1859 format = alt;
1860 }
1861
1862 gen_offsets[idx] = *gen_stride;
1863 gen_formats[other_idx] = format;
1864 offset = *gen_stride;
1865 *gen_stride += size;
1866 }
1867
1868 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1869 }
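
/* Linkage example (illustrative, assuming the varying is not captured for
 * XFB): if this stage declares a varying as MALI_RGBA32F (16 bytes) but the
 * other stage declares it MALI_RGBA16F (8 bytes), interpolation happens at
 * the smaller size, the fp16 format is adopted, and the watermark
 * (*gen_stride) advances by 8 bytes; the next linked varying starts there. */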
1870
1871 /* Higher-level wrapper around all of the above, classifying a varying as a
1872  * special, an XFB capture, or a general (linked or unlinked) varying */
1873
1874 static void
1875 panfrost_emit_varying(
1876 struct mali_attribute_packed *out,
1877 struct panfrost_shader_state *stage,
1878 struct panfrost_shader_state *other,
1879 struct panfrost_shader_state *xfb,
1880 unsigned present,
1881 unsigned max_xfb,
1882 unsigned *streamout_offsets,
1883 unsigned quirks,
1884 unsigned *gen_offsets,
1885 enum mali_format *gen_formats,
1886 unsigned *gen_stride,
1887 unsigned idx,
1888 bool should_alloc,
1889 bool is_fragment)
1890 {
1891 gl_varying_slot loc = stage->varyings_loc[idx];
1892 enum mali_format format = stage->varyings[idx];
1893
1894 /* Override format to match linkage */
1895 if (!should_alloc && gen_formats[idx])
1896 format = gen_formats[idx];
1897
1898 if (has_point_coord(stage->point_sprite_mask, loc)) {
1899 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1900 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1901 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1902 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1903 } else if (loc == VARYING_SLOT_POS) {
1904 if (is_fragment)
1905 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1906 else
1907 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1908 } else if (loc == VARYING_SLOT_PSIZ) {
1909 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1910 } else if (loc == VARYING_SLOT_PNTC) {
1911 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1912 } else if (loc == VARYING_SLOT_FACE) {
1913 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1914 } else {
1915 pan_emit_general_varying(out, other, xfb, loc, format, present,
1916 quirks, gen_offsets, gen_formats, gen_stride,
1917 idx, should_alloc);
1918 }
1919 }
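
/* Note the precedence implied by the chain above: a point-sprite override
 * wins over XFB capture, which wins over the named specials (position, point
 * size, point coord, face), with the general linked/unlinked path as the
 * fallback. */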
1920
1921 static void
1922 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1923 unsigned present,
1924 enum pan_special_varying v,
1925 unsigned special)
1926 {
1927 if (present & (1 << v)) {
1928 unsigned idx = pan_varying_index(present, v);
1929
1930 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1931 cfg.special = special;
1932 cfg.type = 0;
1933 }
1934 }
1935 }
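
/* E.g. when PAN_VARY_PNTCOORD is present, the attribute buffer at its
 * computed slot is tagged MALI_ATTRIBUTE_SPECIAL_POINT_COORD, so the value is
 * (presumably) synthesized by the hardware rather than fetched from memory. */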
1936
1937 void
1938 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1939 unsigned vertex_count,
1940 struct mali_vertex_tiler_postfix *vertex_postfix,
1941 struct mali_vertex_tiler_postfix *tiler_postfix,
1942 union midgard_primitive_size *primitive_size)
1943 {
1944 /* Load the shaders */
1945 struct panfrost_context *ctx = batch->ctx;
1946 struct panfrost_device *dev = pan_device(ctx->base.screen);
1947 struct panfrost_shader_state *vs, *fs;
1948 size_t vs_size, fs_size;
1949
1950 /* Allocate the varying descriptor */
1951
1952 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1953 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1954 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1955 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1956
1957 struct panfrost_transfer trans = panfrost_pool_alloc(&batch->pool,
1958 vs_size +
1959 fs_size);
1960
1961 struct pipe_stream_output_info *so = &vs->stream_output;
1962 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1963
1964 /* Check if this varying is linked by us. This is the case for
1965 * general-purpose, non-captured varyings. If it is, link it. If it's
1966 * not, use the provided stream out information to determine the
1967 * offset, since it was already linked for us. */
1968
1969 unsigned gen_offsets[32];
1970 enum mali_format gen_formats[32];
1971 memset(gen_offsets, 0, sizeof(gen_offsets));
1972 memset(gen_formats, 0, sizeof(gen_formats));
1973
1974 unsigned gen_stride = 0;
1975 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1976 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1977
1978 unsigned streamout_offsets[32];
1979
1980 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1981 streamout_offsets[i] = panfrost_streamout_offset(
1982 so->stride[i],
1983 ctx->streamout.offsets[i],
1984 ctx->streamout.targets[i]);
1985 }
1986
1987 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1988 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1989
1990 for (unsigned i = 0; i < vs->varying_count; i++) {
1991 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1992 ctx->streamout.num_targets, streamout_offsets,
1993 dev->quirks,
1994 gen_offsets, gen_formats, &gen_stride, i, true, false);
1995 }
1996
1997 for (unsigned i = 0; i < fs->varying_count; i++) {
1998 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1999 ctx->streamout.num_targets, streamout_offsets,
2000 dev->quirks,
2001 gen_offsets, gen_formats, &gen_stride, i, false, true);
2002 }
2003
2004 unsigned xfb_base = pan_xfb_base(present);
2005 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
2006 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets));
2007 struct mali_attribute_buffer_packed *varyings =
2008 (struct mali_attribute_buffer_packed *) T.cpu;
2009
2010 /* Emit the stream out buffers */
2011
2012 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2013 ctx->vertex_count);
2014
2015 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2016 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2017 so->stride[i],
2018 ctx->streamout.offsets[i],
2019 out_count,
2020 ctx->streamout.targets[i]);
2021 }
2022
2023 panfrost_emit_varyings(batch,
2024 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2025 gen_stride, vertex_count);
2026
2027 /* fp32 vec4 gl_Position */
2028 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2029 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2030 sizeof(float) * 4, vertex_count);
2031
2032 if (present & (1 << PAN_VARY_PSIZ)) {
2033 primitive_size->pointer = panfrost_emit_varyings(batch,
2034 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2035 2, vertex_count);
2036 }
2037
2038 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
2039 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
2040 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
2041
2042 vertex_postfix->varyings = T.gpu;
2043 tiler_postfix->varyings = T.gpu;
2044
2045 vertex_postfix->varying_meta = trans.gpu;
2046 tiler_postfix->varying_meta = trans.gpu + vs_size;
2047 }
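
/* Resulting layout (as constructed above): 'trans' packs vs->varying_count
 * ATTRIBUTE records followed by fs->varying_count records, referenced by the
 * two varying_meta pointers; 'T' packs popcount(present) ATTRIBUTE_BUFFER
 * records followed by one per streamout target, and is shared by both
 * postfixes through 'varyings'. */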
2048
2049 void
2050 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2051 struct mali_vertex_tiler_prefix *vertex_prefix,
2052 struct mali_vertex_tiler_postfix *vertex_postfix,
2053 struct mali_vertex_tiler_prefix *tiler_prefix,
2054 struct mali_vertex_tiler_postfix *tiler_postfix,
2055 union midgard_primitive_size *primitive_size)
2056 {
2057 struct panfrost_context *ctx = batch->ctx;
2058 struct panfrost_device *device = pan_device(ctx->base.screen);
2059 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2060 struct bifrost_payload_vertex bifrost_vertex = {0,};
2061 struct bifrost_payload_tiler bifrost_tiler = {0,};
2062 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2063 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2064 void *vp, *tp;
2065 size_t vp_size, tp_size;
2066
2067 if (device->quirks & IS_BIFROST) {
2068 bifrost_vertex.prefix = *vertex_prefix;
2069 bifrost_vertex.postfix = *vertex_postfix;
2070 vp = &bifrost_vertex;
2071 vp_size = sizeof(bifrost_vertex);
2072
2073 bifrost_tiler.prefix = *tiler_prefix;
2074 bifrost_tiler.tiler.primitive_size = *primitive_size;
2075 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2076 bifrost_tiler.postfix = *tiler_postfix;
2077 tp = &bifrost_tiler;
2078 tp_size = sizeof(bifrost_tiler);
2079 } else {
2080 midgard_vertex.prefix = *vertex_prefix;
2081 midgard_vertex.postfix = *vertex_postfix;
2082 vp = &midgard_vertex;
2083 vp_size = sizeof(midgard_vertex);
2084
2085 midgard_tiler.prefix = *tiler_prefix;
2086 midgard_tiler.postfix = *tiler_postfix;
2087 midgard_tiler.primitive_size = *primitive_size;
2088 tp = &midgard_tiler;
2089 tp_size = sizeof(midgard_tiler);
2090 }
2091
2092 if (wallpapering) {
2093 /* Inject in reverse order, with "predicted" job indices.
2094 * THIS IS A HACK XXX */
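                /* (Presumably the "prediction" is that the tiler, injected
                 * first, must depend on the vertex job injected right after
                 * it, which will receive an index two past the current
                 * scoreboard job_index.) */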
2095 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2096 batch->scoreboard.job_index + 2, tp, tp_size, true);
2097 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2098 vp, vp_size, true);
2099 return;
2100 }
2101
2102         /* If rasterizer discard is enabled, only submit the vertex job */
2103
2104 bool rasterizer_discard = ctx->rasterizer &&
2105 ctx->rasterizer->base.rasterizer_discard;
2106
2107 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2108 vp, vp_size, false);
2109
2110 if (rasterizer_discard)
2111 return;
2112
2113 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2114 false);
2115 }
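
/* In the common path above, the vertex job is created first and its returned
 * index is passed as the tiler job's dependency, so tiling never starts before
 * shading; with rasterizer discard the tiler job is simply never created. */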
2116
2117 /* TODO: stop hardcoding this */
2118 mali_ptr
2119 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2120 {
2121 uint16_t locations[] = {
2122 128, 128,
2123 0, 256,
2124 0, 256,
2125 0, 256,
2126 0, 256,
2127 0, 256,
2128 0, 256,
2129 0, 256,
2130 0, 256,
2131 0, 256,
2132 0, 256,
2133 0, 256,
2134 0, 256,
2135 0, 256,
2136 0, 256,
2137 0, 256,
2138 0, 256,
2139 0, 256,
2140 0, 256,
2141 0, 256,
2142 0, 256,
2143 0, 256,
2144 0, 256,
2145 0, 256,
2146 0, 256,
2147 0, 256,
2148 0, 256,
2149 0, 256,
2150 0, 256,
2151 0, 256,
2152 0, 256,
2153 0, 256,
2154 128, 128,
2155 0, 0,
2156 0, 0,
2157 0, 0,
2158 0, 0,
2159 0, 0,
2160 0, 0,
2161 0, 0,
2162 0, 0,
2163 0, 0,
2164 0, 0,
2165 0, 0,
2166 0, 0,
2167 0, 0,
2168 0, 0,
2169 0, 0,
2170 };
2171
2172 return panfrost_pool_upload(&batch->pool, locations, 96 * sizeof(uint16_t));
2173 }