panfrost: Simplify zsa == NULL case
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_pool_upload(&batch->pool, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
75 postfix->shared_memory = panfrost_batch_reserve_framebuffer(batch);
76 }
77
78 static void
79 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
80 struct mali_vertex_tiler_prefix *prefix,
81 struct mali_vertex_tiler_postfix *postfix)
82 {
83 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
84
85 postfix->gl_enables |= 0x7;
86 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
87 rasterizer && rasterizer->base.front_ccw);
88 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
89 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
90 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
91 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
92 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
93 rasterizer && rasterizer->base.flatshade_first);
94 }
95
96 void
97 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
98 struct mali_vertex_tiler_prefix *prefix,
99 union midgard_primitive_size *primitive_size)
100 {
101 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
102
103 if (!panfrost_writes_point_size(ctx)) {
104 bool points = prefix->draw_mode == MALI_DRAW_MODE_POINTS;
105 float val = 0.0f;
106
107 if (rasterizer)
108 val = points ?
109 rasterizer->base.point_size :
110 rasterizer->base.line_width;
111
112 primitive_size->constant = val;
113 }
114 }
115
116 static void
117 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
118 struct mali_vertex_tiler_postfix *postfix)
119 {
120 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
121 if (ctx->occlusion_query) {
122 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
123 panfrost_batch_add_bo(ctx->batch, ctx->occlusion_query->bo,
124 PAN_BO_ACCESS_SHARED |
125 PAN_BO_ACCESS_RW |
126 PAN_BO_ACCESS_FRAGMENT);
127 } else {
128 postfix->occlusion_counter = 0;
129 }
130 }
131
132 void
133 panfrost_vt_init(struct panfrost_context *ctx,
134 enum pipe_shader_type stage,
135 struct mali_vertex_tiler_prefix *prefix,
136 struct mali_vertex_tiler_postfix *postfix)
137 {
138 struct panfrost_device *device = pan_device(ctx->base.screen);
139
140 if (!ctx->shader[stage])
141 return;
142
143 memset(prefix, 0, sizeof(*prefix));
144 memset(postfix, 0, sizeof(*postfix));
145
146 if (device->quirks & IS_BIFROST) {
147 postfix->gl_enables = 0x2;
148 panfrost_vt_emit_shared_memory(ctx, postfix);
149 } else {
150 postfix->gl_enables = 0x6;
151 panfrost_vt_attach_framebuffer(ctx, postfix);
152 }
153
154 if (stage == PIPE_SHADER_FRAGMENT) {
155 panfrost_vt_update_occlusion_query(ctx, postfix);
156 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
157 }
158 }
159
160 static unsigned
161 panfrost_translate_index_size(unsigned size)
162 {
163 switch (size) {
164 case 1:
165 return MALI_DRAW_INDEXED_UINT8;
166
167 case 2:
168 return MALI_DRAW_INDEXED_UINT16;
169
170 case 4:
171 return MALI_DRAW_INDEXED_UINT32;
172
173 default:
174 unreachable("Invalid index size");
175 }
176 }
177
178 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
179  * good for the duration of the draw (transient), though it could last longer. Also gets
180 * the bounds on the index buffer for the range accessed by the draw. We do
181 * these operations together because there are natural optimizations which
182 * require them to be together. */
183
184 static mali_ptr
185 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
186 const struct pipe_draw_info *info,
187 unsigned *min_index, unsigned *max_index)
188 {
189 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
190 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
191 off_t offset = info->start * info->index_size;
192 bool needs_indices = true;
193 mali_ptr out = 0;
194
195 if (info->max_index != ~0u) {
196 *min_index = info->min_index;
197 *max_index = info->max_index;
198 needs_indices = false;
199 }
200
201 if (!info->has_user_indices) {
202 /* Only resources can be directly mapped */
203 panfrost_batch_add_bo(batch, rsrc->bo,
204 PAN_BO_ACCESS_SHARED |
205 PAN_BO_ACCESS_READ |
206 PAN_BO_ACCESS_VERTEX_TILER);
207 out = rsrc->bo->gpu + offset;
208
209 /* Check the cache */
210 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
211 info->start,
212 info->count,
213 min_index,
214 max_index);
215 } else {
216 /* Otherwise, we need to upload to transient memory */
217 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
218 out = panfrost_pool_upload(&batch->pool, ibuf8 + offset,
219 info->count *
220 info->index_size);
221 }
222
223 if (needs_indices) {
224 /* Fallback */
225 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
226
227 if (!info->has_user_indices)
228 panfrost_minmax_cache_add(rsrc->index_cache,
229 info->start, info->count,
230 *min_index, *max_index);
231 }
232
233 return out;
234 }
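/* Worked example of the bounded-index path above, with made-up indices: a
 * draw whose index buffer holds {5, 2, 9} yields min_index = 2 and
 * max_index = 9, so panfrost_vt_set_draw_info() below computes
 * vertex_count = 9 - 2 + 1 = 8, offset_start = 2 + info->index_bias and
 * offset_bias_correction = -2. */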
235
236 void
237 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
238 const struct pipe_draw_info *info,
239 enum mali_draw_mode draw_mode,
240 struct mali_vertex_tiler_postfix *vertex_postfix,
241 struct mali_vertex_tiler_prefix *tiler_prefix,
242 struct mali_vertex_tiler_postfix *tiler_postfix,
243 unsigned *vertex_count,
244 unsigned *padded_count)
245 {
246 tiler_prefix->draw_mode = draw_mode;
247
248 unsigned draw_flags = 0;
249
250 if (panfrost_writes_point_size(ctx))
251 draw_flags |= MALI_DRAW_VARYING_SIZE;
252
253 if (info->primitive_restart)
254 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
255
256 /* These don't make much sense */
257
258 draw_flags |= 0x3000;
259
260 if (info->index_size) {
261 unsigned min_index = 0, max_index = 0;
262
263 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
264 info,
265 &min_index,
266 &max_index);
267
268 /* Use the corresponding values */
269 *vertex_count = max_index - min_index + 1;
270 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
271 tiler_prefix->offset_bias_correction = -min_index;
272 tiler_prefix->index_count = MALI_POSITIVE(info->count);
273 draw_flags |= panfrost_translate_index_size(info->index_size);
274 } else {
275 tiler_prefix->indices = 0;
276 *vertex_count = ctx->vertex_count;
277 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
278 tiler_prefix->offset_bias_correction = 0;
279 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
280 }
281
282 tiler_prefix->unknown_draw = draw_flags;
283
284 /* Encode the padded vertex count */
285
286 if (info->instance_count > 1) {
287 *padded_count = panfrost_padded_vertex_count(*vertex_count);
288
289 unsigned shift = __builtin_ctz(ctx->padded_count);
290 unsigned k = ctx->padded_count >> (shift + 1);
291
292 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
293 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
294 } else {
295 *padded_count = *vertex_count;
296
297 /* Reset instancing state */
298 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
299 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
300 }
301 }
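/* Worked example of the instancing encoding above: for a padded count of 12
 * (0b1100), shift = ctz(12) = 2 and odd = 12 >> 3 = 1, so the count
 * decomposes as (2 * odd + 1) << shift = 3 << 2 = 12. */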
302
303 static void
304 panfrost_shader_meta_init(struct panfrost_context *ctx,
305 enum pipe_shader_type st,
306 struct mali_shader_meta *meta)
307 {
308 const struct panfrost_device *dev = pan_device(ctx->base.screen);
309 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
310
311 memset(meta, 0, sizeof(*meta));
312 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
313 meta->attribute_count = ss->attribute_count;
314 meta->varying_count = ss->varying_count;
315 meta->texture_count = ctx->sampler_view_count[st];
316 meta->sampler_count = ctx->sampler_count[st];
317
318 if (dev->quirks & IS_BIFROST) {
319 if (st == PIPE_SHADER_VERTEX)
320 meta->bifrost1.unk1 = 0x800000;
321 else {
322 /* First clause ATEST |= 0x4000000.
323 * Less than 32 regs |= 0x200 */
324 meta->bifrost1.unk1 = 0x950020;
325 }
326
327 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
328 if (st == PIPE_SHADER_VERTEX)
329 meta->bifrost2.preload_regs = 0xC0;
330 else {
331 meta->bifrost2.preload_regs = 0x1;
332 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
333 }
334
335 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
336 ss->uniform_cutoff);
337 } else {
338 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
339 ss->uniform_cutoff);
340 meta->midgard1.work_count = ss->work_reg_count;
341
342 /* TODO: This is not conformant on ES3 */
343 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
344
345 meta->midgard1.flags_lo = 0x20;
346 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
347
348 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
349 }
350 }
351
352 static unsigned
353 panfrost_translate_compare_func(enum pipe_compare_func in)
354 {
355 switch (in) {
356 case PIPE_FUNC_NEVER:
357 return MALI_FUNC_NEVER;
358
359 case PIPE_FUNC_LESS:
360 return MALI_FUNC_LESS;
361
362 case PIPE_FUNC_EQUAL:
363 return MALI_FUNC_EQUAL;
364
365 case PIPE_FUNC_LEQUAL:
366 return MALI_FUNC_LEQUAL;
367
368 case PIPE_FUNC_GREATER:
369 return MALI_FUNC_GREATER;
370
371 case PIPE_FUNC_NOTEQUAL:
372 return MALI_FUNC_NOT_EQUAL;
373
374 case PIPE_FUNC_GEQUAL:
375 return MALI_FUNC_GEQUAL;
376
377 case PIPE_FUNC_ALWAYS:
378 return MALI_FUNC_ALWAYS;
379
380 default:
381 unreachable("Invalid func");
382 }
383 }
384
385 static unsigned
386 panfrost_translate_stencil_op(enum pipe_stencil_op in)
387 {
388 switch (in) {
389 case PIPE_STENCIL_OP_KEEP:
390 return MALI_STENCIL_OP_KEEP;
391
392 case PIPE_STENCIL_OP_ZERO:
393 return MALI_STENCIL_OP_ZERO;
394
395 case PIPE_STENCIL_OP_REPLACE:
396 return MALI_STENCIL_OP_REPLACE;
397
398 case PIPE_STENCIL_OP_INCR:
399 return MALI_STENCIL_OP_INCR_SAT;
400
401 case PIPE_STENCIL_OP_DECR:
402 return MALI_STENCIL_OP_DECR_SAT;
403
404 case PIPE_STENCIL_OP_INCR_WRAP:
405 return MALI_STENCIL_OP_INCR_WRAP;
406
407 case PIPE_STENCIL_OP_DECR_WRAP:
408 return MALI_STENCIL_OP_DECR_WRAP;
409
410 case PIPE_STENCIL_OP_INVERT:
411 return MALI_STENCIL_OP_INVERT;
412
413 default:
414 unreachable("Invalid stencil op");
415 }
416 }
417
418 static unsigned
419 translate_tex_wrap(enum pipe_tex_wrap w)
420 {
421 switch (w) {
422 case PIPE_TEX_WRAP_REPEAT:
423 return MALI_WRAP_MODE_REPEAT;
424
425 case PIPE_TEX_WRAP_CLAMP:
426 return MALI_WRAP_MODE_CLAMP;
427
428 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
429 return MALI_WRAP_MODE_CLAMP_TO_EDGE;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
432 return MALI_WRAP_MODE_CLAMP_TO_BORDER;
433
434 case PIPE_TEX_WRAP_MIRROR_REPEAT:
435 return MALI_WRAP_MODE_MIRRORED_REPEAT;
436
437 case PIPE_TEX_WRAP_MIRROR_CLAMP:
438 return MALI_WRAP_MODE_MIRRORED_CLAMP;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
441 return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
444 return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
445
446 default:
447 unreachable("Invalid wrap");
448 }
449 }
450
451 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
452 struct mali_sampler_descriptor *hw)
453 {
454 unsigned func = panfrost_translate_compare_func(cso->compare_func);
455 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
456 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
457 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
458 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
459 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
460 unsigned mip_filter = mip_linear ?
461 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
462 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
463
464 *hw = (struct mali_sampler_descriptor) {
465 .filter_mode = min_filter | mag_filter | mip_filter |
466 normalized,
467 .wrap_s = translate_tex_wrap(cso->wrap_s),
468 .wrap_t = translate_tex_wrap(cso->wrap_t),
469 .wrap_r = translate_tex_wrap(cso->wrap_r),
470 .compare_func = cso->compare_mode ?
471 panfrost_flip_compare_func(func) :
472 MALI_FUNC_NEVER,
473 .border_color = {
474 cso->border_color.f[0],
475 cso->border_color.f[1],
476 cso->border_color.f[2],
477 cso->border_color.f[3]
478 },
479 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
480 .max_lod = FIXED_16(cso->max_lod, false),
481 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
482 .seamless_cube_map = cso->seamless_cube_map,
483 };
484
485 /* If necessary, we disable mipmapping in the sampler descriptor by
486 * clamping the LOD as tight as possible (from 0 to epsilon,
487 * essentially -- remember these are fixed point numbers, so
488 * epsilon=1/256) */
489
490 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
491 hw->max_lod = hw->min_lod + 1;
492 }
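/* Illustration of the mipmap-disable clamp above, assuming FIXED_16 packs
 * an 8.8 fixed-point value (one step == 1/256, the epsilon mentioned
 * above): with min_lod = 0.0, max_lod becomes 0 + 1 in fixed point, i.e.
 * the LOD is clamped to [0, 1/256] and sampling is effectively restricted
 * to the base level. */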
493
494 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
495 struct bifrost_sampler_descriptor *hw)
496 {
497 *hw = (struct bifrost_sampler_descriptor) {
498 .unk1 = 0x1,
499 .wrap_s = translate_tex_wrap(cso->wrap_s),
500 .wrap_t = translate_tex_wrap(cso->wrap_t),
501 .wrap_r = translate_tex_wrap(cso->wrap_r),
502 .unk8 = 0x8,
503 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
504 .norm_coords = cso->normalized_coords,
505 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
506 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
507 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
508 .max_lod = FIXED_16(cso->max_lod, false),
509 };
510
511 /* If necessary, we disable mipmapping in the sampler descriptor by
512 * clamping the LOD as tight as possible (from 0 to epsilon,
513 * essentially -- remember these are fixed point numbers, so
514 * epsilon=1/256) */
515
516 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
517 hw->max_lod = hw->min_lod + 1;
518 }
519
520 static void
521 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
522 void *out)
523 {
524 pan_pack(out, STENCIL, cfg) {
525 cfg.mask = in->valuemask;
526 cfg.compare_function = panfrost_translate_compare_func(in->func);
527 cfg.stencil_fail = panfrost_translate_stencil_op(in->fail_op);
528 cfg.depth_fail = panfrost_translate_stencil_op(in->zfail_op);
529 cfg.depth_pass = panfrost_translate_stencil_op(in->zpass_op);
530 }
531 }
532
533 static void
534 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
535 struct mali_shader_meta *fragmeta)
536 {
537 if (!ctx->rasterizer) {
538 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
539 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
540 fragmeta->depth_units = 0.0f;
541 fragmeta->depth_factor = 0.0f;
542 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
544 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, true);
545 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, true);
546 return;
547 }
548
549 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
550
551 bool msaa = rast->multisample;
552
553 /* TODO: Sample size */
554 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
555 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
556
557 struct panfrost_shader_state *fs;
558 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
559
560 /* EXT_shader_framebuffer_fetch requires the shader to be run
561 * per-sample when outputs are read. */
562 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
563 SET_BIT(fragmeta->unknown2_3, MALI_PER_SAMPLE, msaa && per_sample);
564
565 fragmeta->depth_units = rast->offset_units * 2.0f;
566 fragmeta->depth_factor = rast->offset_scale;
567
568 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
569
570 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
571 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
572
573 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
574 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
575 }
576
577 static void
578 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
579 struct mali_shader_meta *fragmeta)
580 {
581 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
582 int zfunc = PIPE_FUNC_ALWAYS;
583
584 if (!zsa) {
585 /* With no ZSA state bound, depth/stencil testing is disabled and the state is irrelevant */
586 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
587 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
588 } else {
589 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
590 zsa->stencil[0].enabled);
591 panfrost_make_stencil_state(&zsa->stencil[0],
592 &fragmeta->stencil_front);
593 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
594
595 /* The bottom 8 bits of the stencil state are the stencil ref; the ref is no
596  * more than 8 bits. Be extra careful. */
597 fragmeta->stencil_front.opaque[0] |= ctx->stencil_ref.ref_value[0];
598
599 /* If back-stencil is not enabled, use the front values */
600
601 if (zsa->stencil[1].enabled) {
602 panfrost_make_stencil_state(&zsa->stencil[1],
603 &fragmeta->stencil_back);
604 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
605 fragmeta->stencil_back.opaque[0] |= ctx->stencil_ref.ref_value[1];
606 } else {
607 fragmeta->stencil_back = fragmeta->stencil_front;
608 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
609 }
610
611 if (zsa->depth.enabled)
612 zfunc = zsa->depth.func;
613
614 /* Depth state (TODO: Refactor) */
615
616 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
617 zsa->depth.writemask);
618 }
619
620 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
621 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
622 }
623
624 static bool
625 panfrost_fs_required(
626 struct panfrost_shader_state *fs,
627 struct panfrost_blend_final *blend,
628 unsigned rt_count)
629 {
630 /* If we generally have side effects */
631 if (fs->fs_sidefx)
632 return true;
633
634 /* If colour is written we need to execute */
635 for (unsigned i = 0; i < rt_count; ++i) {
636 if (!blend[i].no_colour)
637 return true;
638 }
639
640 /* If depth is written and not implied we need to execute.
641 * TODO: Predicate on Z/S writes being enabled */
642 return (fs->writes_depth || fs->writes_stencil);
643 }
644
645 static void
646 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
647 struct mali_shader_meta *fragmeta,
648 void *rts)
649 {
650 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
651 const struct panfrost_device *dev = pan_device(ctx->base.screen);
652 struct panfrost_shader_state *fs;
653 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
654
655 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
656 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
657 !ctx->blend->base.dither);
658
659 SET_BIT(fragmeta->unknown2_4, MALI_ALPHA_TO_COVERAGE,
660 ctx->blend->base.alpha_to_coverage);
661
662 /* Get blending setup */
663 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
664
665 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
666 unsigned shader_offset = 0;
667 struct panfrost_bo *shader_bo = NULL;
668
669 for (unsigned c = 0; c < rt_count; ++c)
670 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
671 &shader_offset);
672
673 /* Disable shader execution if we can */
674 if (dev->quirks & MIDGARD_SHADERLESS
675 && !panfrost_fs_required(fs, blend, rt_count)) {
676 fragmeta->shader = 0;
677 fragmeta->attribute_count = 0;
678 fragmeta->varying_count = 0;
679 fragmeta->texture_count = 0;
680 fragmeta->sampler_count = 0;
681
682 /* This feature is not known to work on Bifrost */
683 fragmeta->midgard1.work_count = 1;
684 fragmeta->midgard1.uniform_count = 0;
685 fragmeta->midgard1.uniform_buffer_count = 0;
686 }
687
688 /* If there is a blend shader, work registers are shared. We impose 8
689 * work registers as a limit for blend shaders. Should be lower XXX */
690
691 if (!(dev->quirks & IS_BIFROST)) {
692 for (unsigned c = 0; c < rt_count; ++c) {
693 if (blend[c].is_shader) {
694 fragmeta->midgard1.work_count =
695 MAX2(fragmeta->midgard1.work_count, 8);
696 }
697 }
698 }
699
700 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
701 * copied to the blend_meta appended (by convention), but this is the
702 * field actually read by the hardware. (Or maybe both are read...?).
703 * Specify the last RTi with a blend shader. */
704
705 fragmeta->blend.shader = 0;
706
707 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
708 if (!blend[rt].is_shader)
709 continue;
710
711 fragmeta->blend.shader = blend[rt].shader.gpu |
712 blend[rt].shader.first_tag;
713 break;
714 }
715
716 if (dev->quirks & MIDGARD_SFBD) {
717 /* On platforms with only a single render target (SFBD), the blend
718 * information is inside the shader meta itself. We additionally
719 * need to signal CAN_DISCARD for nontrivial blend modes (so
720 * we're able to read back the destination buffer) */
721
722 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
723 blend[0].is_shader);
724
725 if (!blend[0].is_shader) {
726 fragmeta->blend.equation = *blend[0].equation.equation;
727 fragmeta->blend.constant = blend[0].equation.constant;
728 }
729
730 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
731 !blend[0].no_blending || fs->can_discard);
732
733 batch->draws |= PIPE_CLEAR_COLOR0;
734 return;
735 }
736
737 if (dev->quirks & IS_BIFROST) {
738 bool no_blend = true;
739
740 for (unsigned i = 0; i < rt_count; ++i)
741 no_blend &= (blend[i].no_blending | blend[i].no_colour);
742
743 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
744 !fs->can_discard && !fs->writes_depth && no_blend);
745 }
746
747 /* Additional blend descriptor tacked on for jobs using MFBD */
748
749 for (unsigned i = 0; i < rt_count; ++i) {
750 unsigned flags = 0;
751
752 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
753 flags = 0x200;
754 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
755
756 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
757 (ctx->pipe_framebuffer.cbufs[i]) &&
758 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
759
760 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
761 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
762 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
763 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
764 }
765
766 if (dev->quirks & IS_BIFROST) {
767 struct bifrost_blend_rt *brts = rts;
768
769 brts[i].flags = flags;
770
771 if (blend[i].is_shader) {
772 /* The blend shader's address needs to be at
773 * the same top 32 bit as the fragment shader.
774 * TODO: Ensure that's always the case.
775 */
776 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
777 (fs->bo->gpu & (0xffffffffull << 32)));
778 brts[i].shader = blend[i].shader.gpu;
779 brts[i].unk2 = 0x0;
780 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
781 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
782 const struct util_format_description *format_desc;
783 format_desc = util_format_description(format);
784
785 brts[i].equation = *blend[i].equation.equation;
786
787 /* TODO: this is a bit more complicated */
788 brts[i].constant = blend[i].equation.constant;
789
790 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
791
792 /* 0x19 disables blending and forces REPLACE
793 * mode (equivalent to rgb_mode = alpha_mode =
794 * x122, colour mask = 0xF). 0x1a allows
795 * blending. */
796 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
797
798 brts[i].shader_type = fs->blend_types[i];
799 } else {
800 /* Dummy attachment for depth-only */
801 brts[i].unk2 = 0x3;
802 brts[i].shader_type = fs->blend_types[i];
803 }
804 } else {
805 struct midgard_blend_rt *mrts = rts;
806 mrts[i].flags = flags;
807
808 if (blend[i].is_shader) {
809 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
810 } else {
811 mrts[i].blend.equation = *blend[i].equation.equation;
812 mrts[i].blend.constant = blend[i].equation.constant;
813 }
814 }
815 }
816 }
817
818 static void
819 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
820 struct mali_shader_meta *fragmeta,
821 void *rts)
822 {
823 const struct panfrost_device *dev = pan_device(ctx->base.screen);
824 struct panfrost_shader_state *fs;
825
826 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
827
828 bool msaa = ctx->rasterizer && ctx->rasterizer->base.multisample;
829 fragmeta->coverage_mask = (msaa ? ctx->sample_mask : ~0) & 0xF;
830
831 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
832 fragmeta->unknown2_4 = 0x4e0;
833
834 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
835 * is required (independent of 32-bit/64-bit descriptors), or why it's
836 * not used on later GPU revisions. Otherwise, all shader jobs fault on
837 * these earlier chips (perhaps this is a chicken bit of some kind).
838 * More investigation is needed. */
839
840 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
841
842 if (dev->quirks & IS_BIFROST) {
843 /* TODO */
844 } else {
845 /* Depending on whether it's legal in the given shader, we try to
846 * enable early-z testing. TODO: respect e-z force */
847
848 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
849 !fs->can_discard && !fs->writes_global &&
850 !fs->writes_depth && !fs->writes_stencil &&
851 !ctx->blend->base.alpha_to_coverage);
852
853 /* Add the writes Z/S flags if needed. */
854 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
855 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
856
857 /* Any time texturing is used, derivatives are implicitly calculated,
858 * so we need to enable helper invocations */
859
860 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
861 fs->helper_invocations);
862
863 /* If discard is enabled, which bit we set to convey this
864 * depends on if depth/stencil is used for the draw or not.
865 * Just one of depth OR stencil is enough to trigger this. */
866
867 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
868 bool zs_enabled = fs->writes_depth || fs->writes_stencil;
869
870 if (zsa) {
871 zs_enabled |= (zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
872 zs_enabled |= zsa->stencil[0].enabled;
873 }
874
875 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_TILEBUFFER,
876 fs->outputs_read || (!zs_enabled && fs->can_discard));
877 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, zs_enabled && fs->can_discard);
878 }
879
880 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
881 panfrost_frag_meta_zsa_update(ctx, fragmeta);
882 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
883 }
884
885 void
886 panfrost_emit_shader_meta(struct panfrost_batch *batch,
887 enum pipe_shader_type st,
888 struct mali_vertex_tiler_postfix *postfix)
889 {
890 struct panfrost_context *ctx = batch->ctx;
891 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
892
893 if (!ss) {
894 postfix->shader = 0;
895 return;
896 }
897
898 struct mali_shader_meta meta;
899
900 panfrost_shader_meta_init(ctx, st, &meta);
901
902 /* Add the shader BO to the batch. */
903 panfrost_batch_add_bo(batch, ss->bo,
904 PAN_BO_ACCESS_PRIVATE |
905 PAN_BO_ACCESS_READ |
906 panfrost_bo_access_for_stage(st));
907
908 mali_ptr shader_ptr;
909
910 if (st == PIPE_SHADER_FRAGMENT) {
911 struct panfrost_device *dev = pan_device(ctx->base.screen);
912 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
913 size_t desc_size = sizeof(meta);
914 void *rts = NULL;
915 struct panfrost_transfer xfer;
916 unsigned rt_size;
917
918 if (dev->quirks & MIDGARD_SFBD)
919 rt_size = 0;
920 else if (dev->quirks & IS_BIFROST)
921 rt_size = sizeof(struct bifrost_blend_rt);
922 else
923 rt_size = sizeof(struct midgard_blend_rt);
924
925 desc_size += rt_size * rt_count;
926
927 if (rt_size)
928 rts = rzalloc_size(ctx, rt_size * rt_count);
929
930 panfrost_frag_shader_meta_init(ctx, &meta, rts);
931
932 xfer = panfrost_pool_alloc(&batch->pool, desc_size);
933
934 memcpy(xfer.cpu, &meta, sizeof(meta));
935 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
936
937 if (rt_size)
938 ralloc_free(rts);
939
940 shader_ptr = xfer.gpu;
941 } else {
942 shader_ptr = panfrost_pool_upload(&batch->pool, &meta,
943 sizeof(meta));
944 }
945
946 postfix->shader = shader_ptr;
947 }
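/* For fragment shaders, the descriptor uploaded above is therefore laid
 * out as [ mali_shader_meta | rt_count * per-RT blend record ], with the
 * Bifrost/Midgard blend records appended immediately after the shader
 * meta (or omitted entirely on SFBD, where rt_size is 0). */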
948
949 void
950 panfrost_emit_viewport(struct panfrost_batch *batch,
951 struct mali_vertex_tiler_postfix *tiler_postfix)
952 {
953 struct panfrost_context *ctx = batch->ctx;
954 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
955 const struct pipe_scissor_state *ss = &ctx->scissor;
956 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
957 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
958
959 /* Derive min/max from translate/scale. Note since |x| >= 0 by
960 * definition, we have that -|x| <= |x| hence translate - |scale| <=
961 * translate + |scale|, so the ordering is correct here. */
962 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
963 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
964 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
965 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
966 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
967 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
968
969 /* Scissor to the intersection of viewport and to the scissor, clamped
970 * to the framebuffer */
971
972 unsigned minx = MIN2(fb->width, vp_minx);
973 unsigned maxx = MIN2(fb->width, vp_maxx);
974 unsigned miny = MIN2(fb->height, vp_miny);
975 unsigned maxy = MIN2(fb->height, vp_maxy);
976
977 if (ss && rast && rast->scissor) {
978 minx = MAX2(ss->minx, minx);
979 miny = MAX2(ss->miny, miny);
980 maxx = MIN2(ss->maxx, maxx);
981 maxy = MIN2(ss->maxy, maxy);
982 }
983
984 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
985
986 pan_pack(T.cpu, VIEWPORT, cfg) {
987 cfg.scissor_minimum_x = minx;
988 cfg.scissor_minimum_y = miny;
989 cfg.scissor_maximum_x = maxx - 1;
990 cfg.scissor_maximum_y = maxy - 1;
991
992 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
993 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
994 }
995
996 tiler_postfix->viewport = T.gpu;
997 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
998 }
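/* Example of the min/max derivation above: with the usual GL viewport
 * transform for an 800x600 framebuffer, translate = (400, 300, 0.5) and
 * scale = (400, -300, 0.5), we get vp_minx = 0, vp_maxx = 800,
 * vp_miny = 0, vp_maxy = 600 and a [0, 1] depth range, which is then
 * clamped against the framebuffer and (if enabled) the scissor. */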
999
1000 static mali_ptr
1001 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1002 enum pipe_shader_type st,
1003 struct panfrost_constant_buffer *buf,
1004 unsigned index)
1005 {
1006 struct pipe_constant_buffer *cb = &buf->cb[index];
1007 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1008
1009 if (rsrc) {
1010 panfrost_batch_add_bo(batch, rsrc->bo,
1011 PAN_BO_ACCESS_SHARED |
1012 PAN_BO_ACCESS_READ |
1013 panfrost_bo_access_for_stage(st));
1014
1015 /* Alignment guaranteed by
1016 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1017 return rsrc->bo->gpu + cb->buffer_offset;
1018 } else if (cb->user_buffer) {
1019 return panfrost_pool_upload(&batch->pool,
1020 cb->user_buffer +
1021 cb->buffer_offset,
1022 cb->buffer_size);
1023 } else {
1024 unreachable("No constant buffer");
1025 }
1026 }
1027
1028 struct sysval_uniform {
1029 union {
1030 float f[4];
1031 int32_t i[4];
1032 uint32_t u[4];
1033 uint64_t du[2];
1034 };
1035 };
1036
1037 static void
1038 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1039 struct sysval_uniform *uniform)
1040 {
1041 struct panfrost_context *ctx = batch->ctx;
1042 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1043
1044 uniform->f[0] = vp->scale[0];
1045 uniform->f[1] = vp->scale[1];
1046 uniform->f[2] = vp->scale[2];
1047 }
1048
1049 static void
1050 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1051 struct sysval_uniform *uniform)
1052 {
1053 struct panfrost_context *ctx = batch->ctx;
1054 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1055
1056 uniform->f[0] = vp->translate[0];
1057 uniform->f[1] = vp->translate[1];
1058 uniform->f[2] = vp->translate[2];
1059 }
1060
1061 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1062 enum pipe_shader_type st,
1063 unsigned int sysvalid,
1064 struct sysval_uniform *uniform)
1065 {
1066 struct panfrost_context *ctx = batch->ctx;
1067 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1068 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1069 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1070 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1071
1072 assert(dim);
1073 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1074
1075 if (dim > 1)
1076 uniform->i[1] = u_minify(tex->texture->height0,
1077 tex->u.tex.first_level);
1078
1079 if (dim > 2)
1080 uniform->i[2] = u_minify(tex->texture->depth0,
1081 tex->u.tex.first_level);
1082
1083 if (is_array)
1084 uniform->i[dim] = tex->texture->array_size;
1085 }
1086
1087 static void
1088 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1089 enum pipe_shader_type st,
1090 unsigned ssbo_id,
1091 struct sysval_uniform *uniform)
1092 {
1093 struct panfrost_context *ctx = batch->ctx;
1094
1095 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1096 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1097
1098 /* Compute address */
1099 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1100
1101 panfrost_batch_add_bo(batch, bo,
1102 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1103 panfrost_bo_access_for_stage(st));
1104
1105 /* Upload address and size as sysval */
1106 uniform->du[0] = bo->gpu + sb.buffer_offset;
1107 uniform->u[2] = sb.buffer_size;
1108 }
1109
1110 static void
1111 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1112 enum pipe_shader_type st,
1113 unsigned samp_idx,
1114 struct sysval_uniform *uniform)
1115 {
1116 struct panfrost_context *ctx = batch->ctx;
1117 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1118
1119 uniform->f[0] = sampl->min_lod;
1120 uniform->f[1] = sampl->max_lod;
1121 uniform->f[2] = sampl->lod_bias;
1122
1123 /* Even without any errata, Midgard represents "no mipmapping" as
1124 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1125 * panfrost_create_sampler_state which also explains our choice of
1126 * epsilon value (again to keep behaviour consistent) */
1127
1128 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1129 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1130 }
1131
1132 static void
1133 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1134 struct sysval_uniform *uniform)
1135 {
1136 struct panfrost_context *ctx = batch->ctx;
1137
1138 uniform->u[0] = ctx->compute_grid->grid[0];
1139 uniform->u[1] = ctx->compute_grid->grid[1];
1140 uniform->u[2] = ctx->compute_grid->grid[2];
1141 }
1142
1143 static void
1144 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1145 struct panfrost_shader_state *ss,
1146 enum pipe_shader_type st)
1147 {
1148 struct sysval_uniform *uniforms = (void *)buf;
1149
1150 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1151 int sysval = ss->sysval[i];
1152
1153 switch (PAN_SYSVAL_TYPE(sysval)) {
1154 case PAN_SYSVAL_VIEWPORT_SCALE:
1155 panfrost_upload_viewport_scale_sysval(batch,
1156 &uniforms[i]);
1157 break;
1158 case PAN_SYSVAL_VIEWPORT_OFFSET:
1159 panfrost_upload_viewport_offset_sysval(batch,
1160 &uniforms[i]);
1161 break;
1162 case PAN_SYSVAL_TEXTURE_SIZE:
1163 panfrost_upload_txs_sysval(batch, st,
1164 PAN_SYSVAL_ID(sysval),
1165 &uniforms[i]);
1166 break;
1167 case PAN_SYSVAL_SSBO:
1168 panfrost_upload_ssbo_sysval(batch, st,
1169 PAN_SYSVAL_ID(sysval),
1170 &uniforms[i]);
1171 break;
1172 case PAN_SYSVAL_NUM_WORK_GROUPS:
1173 panfrost_upload_num_work_groups_sysval(batch,
1174 &uniforms[i]);
1175 break;
1176 case PAN_SYSVAL_SAMPLER:
1177 panfrost_upload_sampler_sysval(batch, st,
1178 PAN_SYSVAL_ID(sysval),
1179 &uniforms[i]);
1180 break;
1181 default:
1182 assert(0);
1183 }
1184 }
1185 }
1186
1187 static const void *
1188 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1189 unsigned index)
1190 {
1191 struct pipe_constant_buffer *cb = &buf->cb[index];
1192 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1193
1194 if (rsrc)
1195 return rsrc->bo->cpu;
1196 else if (cb->user_buffer)
1197 return cb->user_buffer;
1198 else
1199 unreachable("No constant buffer");
1200 }
1201
1202 void
1203 panfrost_emit_const_buf(struct panfrost_batch *batch,
1204 enum pipe_shader_type stage,
1205 struct mali_vertex_tiler_postfix *postfix)
1206 {
1207 struct panfrost_context *ctx = batch->ctx;
1208 struct panfrost_shader_variants *all = ctx->shader[stage];
1209
1210 if (!all)
1211 return;
1212
1213 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1214
1215 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1216
1217 /* Uniforms are implicitly UBO #0 */
1218 bool has_uniforms = buf->enabled_mask & (1 << 0);
1219
1220 /* Allocate room for the sysval and the uniforms */
1221 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1222 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1223 size_t size = sys_size + uniform_size;
1224 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1225 size);
1226
1227 /* Upload sysvals requested by the shader */
1228 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1229
1230 /* Upload uniforms */
1231 if (has_uniforms && uniform_size) {
1232 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1233 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1234 }
1235
1236 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1237 * uploaded */
1238
1239 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1240 assert(ubo_count >= 1);
1241
1242 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
1243 struct panfrost_transfer ubos = panfrost_pool_alloc(&batch->pool, sz);
1244 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
1245
1246 /* Upload uniforms as a UBO */
1247
1248 if (ss->uniform_count) {
1249 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
1250 cfg.entries = ss->uniform_count;
1251 cfg.pointer = transfer.gpu;
1252 }
1253 } else {
1254 *ubo_ptr = 0;
1255 }
1256
1257 /* The rest are honest-to-goodness UBOs */
1258
1259 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1260 size_t usz = buf->cb[ubo].buffer_size;
1261 bool enabled = buf->enabled_mask & (1 << ubo);
1262 bool empty = usz == 0;
1263
1264 if (!enabled || empty) {
1265 ubo_ptr[ubo] = 0;
1266 continue;
1267 }
1268
1269 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1270 cfg.entries = DIV_ROUND_UP(usz, 16);
1271 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1272 stage, buf, ubo);
1273 }
1274 }
1275
1276 postfix->uniforms = transfer.gpu;
1277 postfix->uniform_buffers = ubos.gpu;
1278
1279 buf->dirty_mask = 0;
1280 }
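/* Sketch of the resulting layout from the code above:
 *
 *   transfer.gpu  -> [ sysvals: sysval_count * 16 bytes | raw uniforms ]
 *   ubo_ptr[0]    -> UNIFORM_BUFFER descriptor pointing at transfer.gpu
 *                    (or 0 when the shader has no uniforms)
 *   ubo_ptr[1..]  -> honest-to-goodness UBOs, or 0 when unbound/empty
 */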
1281
1282 void
1283 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1284 const struct pipe_grid_info *info,
1285 struct midgard_payload_vertex_tiler *vtp)
1286 {
1287 struct panfrost_context *ctx = batch->ctx;
1288 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1289 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1290 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1291 128));
1292 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1293 info->grid[2] * 4;
1294 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1295 shared_size,
1296 1);
1297
1298 struct mali_shared_memory shared = {
1299 .shared_memory = bo->gpu,
1300 .shared_workgroup_count =
1301 util_logbase2_ceil(info->grid[0]) +
1302 util_logbase2_ceil(info->grid[1]) +
1303 util_logbase2_ceil(info->grid[2]),
1304 .shared_unk1 = 0x2,
1305 .shared_shift = util_logbase2(single_size) - 1
1306 };
1307
1308 vtp->postfix.shared_memory = panfrost_pool_upload(&batch->pool, &shared,
1309 sizeof(shared));
1310 }
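/* Worked example of the sizing above, with hypothetical numbers: a shader
 * with ss->shared_size = 200 rounds up to single_size = 256, so a
 * (4, 4, 1) grid gives shared_size = 256 * 4 * 4 * 1 * 4 = 16384 bytes,
 * shared_workgroup_count = 2 + 2 + 0 = 4 and shared_shift =
 * log2(256) - 1 = 7. */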
1311
1312 static mali_ptr
1313 panfrost_get_tex_desc(struct panfrost_batch *batch,
1314 enum pipe_shader_type st,
1315 struct panfrost_sampler_view *view)
1316 {
1317 if (!view)
1318 return (mali_ptr) 0;
1319
1320 struct pipe_sampler_view *pview = &view->base;
1321 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1322
1323 /* Add the BO to the job so it's retained until the job is done. */
1324
1325 panfrost_batch_add_bo(batch, rsrc->bo,
1326 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1327 panfrost_bo_access_for_stage(st));
1328
1329 panfrost_batch_add_bo(batch, view->bo,
1330 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1331 panfrost_bo_access_for_stage(st));
1332
1333 return view->bo->gpu;
1334 }
1335
1336 static void
1337 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1338 struct pipe_context *pctx)
1339 {
1340 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1341 if (view->texture_bo != rsrc->bo->gpu ||
1342 view->modifier != rsrc->modifier) {
1343 panfrost_bo_unreference(view->bo);
1344 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1345 }
1346 }
1347
1348 void
1349 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1350 enum pipe_shader_type stage,
1351 struct mali_vertex_tiler_postfix *postfix)
1352 {
1353 struct panfrost_context *ctx = batch->ctx;
1354 struct panfrost_device *device = pan_device(ctx->base.screen);
1355
1356 if (!ctx->sampler_view_count[stage])
1357 return;
1358
1359 if (device->quirks & IS_BIFROST) {
1360 struct bifrost_texture_descriptor *descriptors;
1361
1362 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1363 ctx->sampler_view_count[stage]);
1364
1365 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1366 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1367 struct pipe_sampler_view *pview = &view->base;
1368 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1369 panfrost_update_sampler_view(view, &ctx->base);
1370
1371 /* Add the BOs to the job so they are retained until the job is done. */
1372
1373 panfrost_batch_add_bo(batch, rsrc->bo,
1374 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1375 panfrost_bo_access_for_stage(stage));
1376
1377 panfrost_batch_add_bo(batch, view->bo,
1378 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1379 panfrost_bo_access_for_stage(stage));
1380
1381 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1382 }
1383
1384 postfix->textures = panfrost_pool_upload(&batch->pool,
1385 descriptors,
1386 sizeof(struct bifrost_texture_descriptor) *
1387 ctx->sampler_view_count[stage]);
1388
1389 free(descriptors);
1390 } else {
1391 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1392
1393 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1394 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1395
1396 panfrost_update_sampler_view(view, &ctx->base);
1397
1398 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1399 }
1400
1401 postfix->textures = panfrost_pool_upload(&batch->pool,
1402 trampolines,
1403 sizeof(uint64_t) *
1404 ctx->sampler_view_count[stage]);
1405 }
1406 }
1407
1408 void
1409 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1410 enum pipe_shader_type stage,
1411 struct mali_vertex_tiler_postfix *postfix)
1412 {
1413 struct panfrost_context *ctx = batch->ctx;
1414 struct panfrost_device *device = pan_device(ctx->base.screen);
1415
1416 if (!ctx->sampler_count[stage])
1417 return;
1418
1419 if (device->quirks & IS_BIFROST) {
1420 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1421 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1422 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1423 transfer_size);
1424 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1425
1426 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1427 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1428
1429 postfix->sampler_descriptor = transfer.gpu;
1430 } else {
1431 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1432 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1433 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1434 transfer_size);
1435 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1436
1437 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1438 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1439
1440 postfix->sampler_descriptor = transfer.gpu;
1441 }
1442 }
1443
1444 void
1445 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1446 struct mali_vertex_tiler_postfix *vertex_postfix)
1447 {
1448 struct panfrost_context *ctx = batch->ctx;
1449
1450 if (!ctx->vertex)
1451 return;
1452
1453 struct panfrost_vertex_state *so = ctx->vertex;
1454
1455 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1456 vertex_postfix->attribute_meta = panfrost_pool_upload(&batch->pool, so->hw,
1457 sizeof(*so->hw) *
1458 PAN_MAX_ATTRIBUTE);
1459 }
1460
1461 void
1462 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1463 struct mali_vertex_tiler_postfix *vertex_postfix)
1464 {
1465 struct panfrost_context *ctx = batch->ctx;
1466 struct panfrost_vertex_state *so = ctx->vertex;
1467
1468 /* Staged mali_attr, and index into them. i =/= k, depending on the
1469 * vertex buffer mask and instancing. Twice as much room is allocated,
1470  * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1471 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1472 unsigned k = 0;
1473
1474 for (unsigned i = 0; i < so->num_elements; ++i) {
1475 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1476 * means duplicating some vertex buffers (who cares? aside from
1477 * maybe some caching implications but I somehow doubt that
1478 * matters) */
1479
1480 struct pipe_vertex_element *elem = &so->pipe[i];
1481 unsigned vbi = elem->vertex_buffer_index;
1482
1483 /* The exception to 1:1 mapping is that we can have multiple
1484  * entries (NPOT divisors), so we fix up anyway */
1485
1486 so->hw[i].index = k;
1487
1488 if (!(ctx->vb_mask & (1 << vbi)))
1489 continue;
1490
1491 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1492 struct panfrost_resource *rsrc;
1493
1494 rsrc = pan_resource(buf->buffer.resource);
1495 if (!rsrc)
1496 continue;
1497
1498 /* Align to 64 bytes by masking off the lower bits. This
1499 * will be adjusted back when we fixup the src_offset in
1500 * mali_attr_meta */
1501
1502 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1503 mali_ptr addr = raw_addr & ~63;
1504 unsigned chopped_addr = raw_addr - addr;
1505
1506 /* Add a dependency of the batch on the vertex buffer */
1507 panfrost_batch_add_bo(batch, rsrc->bo,
1508 PAN_BO_ACCESS_SHARED |
1509 PAN_BO_ACCESS_READ |
1510 PAN_BO_ACCESS_VERTEX_TILER);
1511
1512 /* Set common fields */
1513 attrs[k].elements = addr;
1514 attrs[k].stride = buf->stride;
1515
1516 /* Since we advanced the base pointer, we shrink the buffer
1517 * size */
1518 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1519
1520 /* We need to add the extra size we masked off (for
1521 * correctness) so the data doesn't get clamped away */
1522 attrs[k].size += chopped_addr;
1523
1524 /* For non-instancing make sure we initialize */
1525 attrs[k].shift = attrs[k].extra_flags = 0;
1526
1527 /* Instancing uses a dramatically different code path than
1528 * linear, so dispatch for the actual emission now that the
1529 * common code is finished */
1530
1531 unsigned divisor = elem->instance_divisor;
1532
1533 if (divisor && ctx->instance_count == 1) {
1534 /* Silly corner case where there's a divisor(=1) but
1535 * there's no legitimate instancing. So we want *every*
1536 * attribute to be the same. So set stride to zero so
1537 * we don't go anywhere. */
1538
1539 attrs[k].size = attrs[k].stride + chopped_addr;
1540 attrs[k].stride = 0;
1541 attrs[k++].elements |= MALI_ATTR_LINEAR;
1542 } else if (ctx->instance_count <= 1) {
1543 /* Normal, non-instanced attributes */
1544 attrs[k++].elements |= MALI_ATTR_LINEAR;
1545 } else {
1546 unsigned instance_shift = vertex_postfix->instance_shift;
1547 unsigned instance_odd = vertex_postfix->instance_odd;
1548
1549 k += panfrost_vertex_instanced(ctx->padded_count,
1550 instance_shift,
1551 instance_odd,
1552 divisor, &attrs[k]);
1553 }
1554 }
1555
1556 /* Add special gl_VertexID/gl_InstanceID buffers */
1557
1558 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1559 so->hw[PAN_VERTEX_ID].index = k++;
1560 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1561 so->hw[PAN_INSTANCE_ID].index = k++;
1562
1563 /* Upload whatever we emitted and go */
1564
1565 vertex_postfix->attributes = panfrost_pool_upload(&batch->pool, attrs,
1566 k * sizeof(*attrs));
1567 }
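/* Worked example of the 64-byte alignment handling above, with a
 * hypothetical buffer: raw_addr = 0x10028 masks down to addr = 0x10000,
 * so chopped_addr = 0x28 is added back onto the size and the remaining
 * 0x28-byte shift is later folded into src_offset in the mali_attr_meta,
 * as noted in the comment above. */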
1568
1569 static mali_ptr
1570 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1571 unsigned stride, unsigned count)
1572 {
1573 /* Fill out the descriptor */
1574 slot->stride = stride;
1575 slot->size = stride * count;
1576 slot->shift = slot->extra_flags = 0;
1577
1578 struct panfrost_transfer transfer = panfrost_pool_alloc(&batch->pool,
1579 slot->size);
1580
1581 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1582
1583 return transfer.gpu;
1584 }
1585
1586 static unsigned
1587 panfrost_streamout_offset(unsigned stride, unsigned offset,
1588 struct pipe_stream_output_target *target)
1589 {
1590 return (target->buffer_offset + (offset * stride * 4)) & 63;
1591 }
1592
1593 static void
1594 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1595 unsigned stride, unsigned offset, unsigned count,
1596 struct pipe_stream_output_target *target)
1597 {
1598 /* Fill out the descriptor */
1599 slot->stride = stride * 4;
1600 slot->shift = slot->extra_flags = 0;
1601
1602 unsigned max_size = target->buffer_size;
1603 unsigned expected_size = slot->stride * count;
1604
1605 /* Grab the BO and bind it to the batch */
1606 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1607
1608 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1609 * the perspective of the TILER and FRAGMENT.
1610 */
1611 panfrost_batch_add_bo(batch, bo,
1612 PAN_BO_ACCESS_SHARED |
1613 PAN_BO_ACCESS_RW |
1614 PAN_BO_ACCESS_VERTEX_TILER |
1615 PAN_BO_ACCESS_FRAGMENT);
1616
1617 /* We will have an offset applied to get alignment */
1618 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1619 slot->elements = (addr & ~63) | MALI_ATTR_LINEAR;
1620 slot->size = MIN2(max_size, expected_size) + (addr & 63);
1621 }
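/* Example of the streamout offset/size math above, with made-up numbers
 * and assuming bo->gpu itself is 64-byte aligned: stride = 4 dwords,
 * offset = 3 vertices, buffer_offset = 8. panfrost_streamout_offset()
 * returns (8 + 3 * 4 * 4) & 63 = 56, while panfrost_emit_streamout()
 * computes addr = bo->gpu + 8 + 3 * 16, aligns elements down to 64 bytes
 * and grows slot->size by (addr & 63) = 56 so the captured data still
 * starts at the requested byte within the aligned window. */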
1622
1623 static bool
1624 has_point_coord(unsigned mask, gl_varying_slot loc)
1625 {
1626 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1627 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1628 else if (loc == VARYING_SLOT_PNTC)
1629 return (mask & (1 << 8));
1630 else
1631 return false;
1632 }
1633
1634 /* Helpers for manipulating stream out information so we can pack varyings
1635 * accordingly. Compute the src_offset for a given captured varying */
1636
1637 static struct pipe_stream_output *
1638 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1639 {
1640 for (unsigned i = 0; i < info->num_outputs; ++i) {
1641 if (info->output[i].register_index == loc)
1642 return &info->output[i];
1643 }
1644
1645 unreachable("Varying not captured");
1646 }
1647
1648 static unsigned
1649 pan_varying_size(enum mali_format fmt)
1650 {
1651 unsigned type = MALI_EXTRACT_TYPE(fmt);
1652 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1653 unsigned bits = MALI_EXTRACT_BITS(fmt);
1654 unsigned bpc = 0;
1655
1656 if (bits == MALI_CHANNEL_FLOAT) {
1657 /* No doubles */
1658 bool fp16 = (type == MALI_FORMAT_SINT);
1659 assert(fp16 || (type == MALI_FORMAT_UNORM));
1660
1661 bpc = fp16 ? 2 : 4;
1662 } else {
1663 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1664
1665 /* See the enums */
1666 bits = 1 << bits;
1667 assert(bits >= 8);
1668 bpc = bits / 8;
1669 }
1670
1671 return bpc * chan;
1672 }
1673
1674 /* Indices for named (non-XFB) varyings that are present. These are packed
1675 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1676 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1677 * of a given special field given a shift S by:
1678 *
1679 * idx = popcount(P & ((1 << S) - 1))
1680 *
1681  * That is... look at all of the varyings that come earlier and count them; that
1682  * count is the index of this buffer (indices are zero-based). Likewise, the total number of special
1683 * buffers required is simply popcount(P)
1684 */
1685
1686 enum pan_special_varying {
1687 PAN_VARY_GENERAL = 0,
1688 PAN_VARY_POSITION = 1,
1689 PAN_VARY_PSIZ = 2,
1690 PAN_VARY_PNTCOORD = 3,
1691 PAN_VARY_FACE = 4,
1692 PAN_VARY_FRAGCOORD = 5,
1693
1694 /* Keep last */
1695 PAN_VARY_MAX,
1696 };
1697
1698 /* Given a varying, figure out which index it corresponds to */
1699
1700 static inline unsigned
1701 pan_varying_index(unsigned present, enum pan_special_varying v)
1702 {
1703 unsigned mask = (1 << v) - 1;
1704 return util_bitcount(present & mask);
1705 }
1706
1707 /* Get the base offset for XFB buffers, which by convention come after
1708 * everything else. Wrapper function for semantic reasons; by construction this
1709 * is just popcount. */
1710
1711 static inline unsigned
1712 pan_xfb_base(unsigned present)
1713 {
1714 return util_bitcount(present);
1715 }
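/* For instance, if the vertex shader writes gl_PointSize and the fragment
 * shader reads gl_PointCoord, present = (1 << PAN_VARY_GENERAL) |
 * (1 << PAN_VARY_POSITION) | (1 << PAN_VARY_PSIZ) | (1 << PAN_VARY_PNTCOORD)
 * = 0b1111, so pan_varying_index(present, PAN_VARY_PSIZ) =
 * popcount(0b0011) = 2 and XFB buffers start at pan_xfb_base(present) =
 * popcount(0b1111) = 4. */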
1716
1717 /* Computes the present mask for varyings so we can start emitting varying records */
1718
1719 static inline unsigned
1720 pan_varying_present(
1721 struct panfrost_shader_state *vs,
1722 struct panfrost_shader_state *fs,
1723 unsigned quirks)
1724 {
1725 /* At the moment we always emit general and position buffers. Not
1726 * strictly necessary but usually harmless */
1727
1728 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1729
1730 /* Enable special buffers by the shader info */
1731
1732 if (vs->writes_point_size)
1733 present |= (1 << PAN_VARY_PSIZ);
1734
1735 if (fs->reads_point_coord)
1736 present |= (1 << PAN_VARY_PNTCOORD);
1737
1738 if (fs->reads_face)
1739 present |= (1 << PAN_VARY_FACE);
1740
1741 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1742 present |= (1 << PAN_VARY_FRAGCOORD);
1743
1744 /* Also, if we have a point sprite, we need a point coord buffer */
1745
1746 for (unsigned i = 0; i < fs->varying_count; i++) {
1747 gl_varying_slot loc = fs->varyings_loc[i];
1748
1749 if (has_point_coord(fs->point_sprite_mask, loc))
1750 present |= (1 << PAN_VARY_PNTCOORD);
1751 }
1752
1753 return present;
1754 }
1755
1756 /* Emitters for varying records */
1757
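/* Emit a single varying record pointing into buffer `buf`, using the default
 * swizzle on GPUs with the HAS_SWIZZLES quirk and the Bifrost encoding
 * otherwise */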
1758 static struct mali_attr_meta
1759 pan_emit_vary(unsigned present, enum pan_special_varying buf,
1760 unsigned quirks, enum mali_format format,
1761 unsigned offset)
1762 {
1763 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1764
1765 struct mali_attr_meta meta = {
1766 .index = pan_varying_index(present, buf),
1767 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1768 .swizzle = quirks & HAS_SWIZZLES ?
1769 panfrost_get_default_swizzle(nr_channels) :
1770 panfrost_bifrost_swizzle(nr_channels),
1771 .format = format,
1772 .src_offset = offset
1773 };
1774
1775 return meta;
1776 }
1777
1778 /* General varying that is unused */
1779
1780 static struct mali_attr_meta
1781 pan_emit_vary_only(unsigned present, unsigned quirks)
1782 {
1783 return pan_emit_vary(present, 0, quirks, MALI_VARYING_DISCARD, 0);
1784 }
1785
1786 /* Special records */
1787
1788 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1789 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1790 [PAN_VARY_PSIZ] = MALI_R16F,
1791 [PAN_VARY_PNTCOORD] = MALI_R16F,
1792 [PAN_VARY_FACE] = MALI_R32I,
1793 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1794 };
1795
1796 static struct mali_attr_meta
1797 pan_emit_vary_special(unsigned present, enum pan_special_varying buf,
1798 unsigned quirks)
1799 {
1800 assert(buf < PAN_VARY_MAX);
1801 return pan_emit_vary(present, buf, quirks, pan_varying_formats[buf], 0);
1802 }
1803
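/* Promote a varying format to 32 bits per channel for transform feedback
 * capture, with the requested number of channels */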
1804 static enum mali_format
1805 pan_xfb_format(enum mali_format format, unsigned nr)
1806 {
1807 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1808 return MALI_R32F | MALI_NR_CHANNELS(nr);
1809 else
1810 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1811 }
1812
1813 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1814 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1815 * value. */
1816
1817 static struct mali_attr_meta
1818 pan_emit_vary_xfb(unsigned present,
1819 unsigned max_xfb,
1820 unsigned *streamout_offsets,
1821 unsigned quirks,
1822 enum mali_format format,
1823 struct pipe_stream_output o)
1824 {
1825 /* The caller determined this varying is captured, so construct an XFB record for it */
1826 struct mali_attr_meta meta = {
1827 /* XFB buffers come after everything else */
1828 .index = pan_xfb_base(present) + o.output_buffer,
1829
1830 /* As usual unknown bit */
1831 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1832
1833 /* Override swizzle with number of channels */
1834 .swizzle = quirks & HAS_SWIZZLES ?
1835 panfrost_get_default_swizzle(o.num_components) :
1836 panfrost_bifrost_swizzle(o.num_components),
1837
1838 /* Override number of channels and precision to highp */
1839 .format = pan_xfb_format(format, o.num_components),
1840
1841 /* Apply given offsets together */
1842 .src_offset = (o.dst_offset * 4) /* dwords */
1843 + streamout_offsets[o.output_buffer]
1844 };
1845
1846 return meta;
1847 }
1848
1849 /* Determine if we should capture a varying for XFB. This requires actually
1850 * having a buffer for it. If we don't capture it, we'll fall back to a general
1851 * varying path (linked or unlinked, possibly discarding the write) */
1852
1853 static bool
1854 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1855 unsigned loc, unsigned max_xfb)
1856 {
1857 if (!(xfb->so_mask & (1ll << loc)))
1858 return false;
1859
1860 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1861 return o->output_buffer < max_xfb;
1862 }
1863
1864 /* Higher-level wrapper around all of the above, classifying a varying into one
1865 * of the above types */
1866
1867 static struct mali_attr_meta
1868 panfrost_emit_varying(
1869 struct panfrost_shader_state *stage,
1870 struct panfrost_shader_state *other,
1871 struct panfrost_shader_state *xfb,
1872 unsigned present,
1873 unsigned max_xfb,
1874 unsigned *streamout_offsets,
1875 unsigned quirks,
1876 unsigned *gen_offsets,
1877 enum mali_format *gen_formats,
1878 unsigned *gen_stride,
1879 unsigned idx,
1880 bool should_alloc,
1881 bool is_fragment)
1882 {
1883 gl_varying_slot loc = stage->varyings_loc[idx];
1884 enum mali_format format = stage->varyings[idx];
1885
1886 /* Override format to match linkage */
1887 if (!should_alloc && gen_formats[idx])
1888 format = gen_formats[idx];
1889
1890 if (has_point_coord(stage->point_sprite_mask, loc)) {
1891 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1892 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1893 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1894 return pan_emit_vary_xfb(present, max_xfb, streamout_offsets, quirks, format, *o);
1895 } else if (loc == VARYING_SLOT_POS) {
1896 if (is_fragment)
1897 return pan_emit_vary_special(present, PAN_VARY_FRAGCOORD, quirks);
1898 else
1899 return pan_emit_vary_special(present, PAN_VARY_POSITION, quirks);
1900 } else if (loc == VARYING_SLOT_PSIZ) {
1901 return pan_emit_vary_special(present, PAN_VARY_PSIZ, quirks);
1902 } else if (loc == VARYING_SLOT_PNTC) {
1903 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1904 } else if (loc == VARYING_SLOT_FACE) {
1905 return pan_emit_vary_special(present, PAN_VARY_FACE, quirks);
1906 }
1907
1908 /* We've exhausted special cases, so it's otherwise a general varying. Check if we're linked */
1909 signed other_idx = -1;
1910
1911 for (unsigned j = 0; j < other->varying_count; ++j) {
1912 if (other->varyings_loc[j] == loc) {
1913 other_idx = j;
1914 break;
1915 }
1916 }
1917
1918 if (other_idx < 0)
1919 return pan_emit_vary_only(present, quirks);
1920
1921 unsigned offset = gen_offsets[other_idx];
1922
1923 if (should_alloc) {
1924 /* We're linked, so allocate a space via a watermark allocation */
1925 enum mali_format alt = other->varyings[other_idx];
1926
1927 /* Do interpolation at minimum precision */
1928 unsigned size_main = pan_varying_size(format);
1929 unsigned size_alt = pan_varying_size(alt);
1930 unsigned size = MIN2(size_main, size_alt);
1931
1932 /* If a varying is marked for XFB but not actually captured, we
1933 * should match the format to the format that would otherwise
1934 * be used for XFB, since dEQP checks for invariance here. It's
1935 * unclear if this is required by the spec. */
1936
1937 if (xfb->so_mask & (1ull << loc)) {
1938 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1939 format = pan_xfb_format(format, o->num_components);
1940 size = pan_varying_size(format);
1941 } else if (size == size_alt) {
1942 format = alt;
1943 }
1944
1945 gen_offsets[idx] = *gen_stride;
1946 gen_formats[other_idx] = format;
1947 offset = *gen_stride;
1948 *gen_stride += size;
1949 }
1950
1951 return pan_emit_vary(present, PAN_VARY_GENERAL,
1952 quirks, format, offset);
1953 }
1954
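/* Write the buffer record for a special fixed-function input (point coord,
 * front facing, frag coord), identified by a magic address, if that input is
 * present */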
1955 static void
1956 pan_emit_special_input(union mali_attr *varyings,
1957 unsigned present,
1958 enum pan_special_varying v,
1959 mali_ptr addr)
1960 {
1961 if (present & (1 << v)) {
1962 /* Ensure we write exactly once for performance and with fields
1963 * zeroed appropriately to avoid flakes */
1964
1965 union mali_attr s = {
1966 .elements = addr
1967 };
1968
1969 varyings[pan_varying_index(present, v)] = s;
1970 }
1971 }
1972
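/* Emit everything varying-related for a draw: the per-stage varying records,
 * the backing buffers for general varyings, gl_Position, gl_PointSize and
 * transform feedback, and the special fixed-function inputs, then point the
 * vertex/tiler postfixes at the uploaded descriptors */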
1973 void
1974 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1975 unsigned vertex_count,
1976 struct mali_vertex_tiler_postfix *vertex_postfix,
1977 struct mali_vertex_tiler_postfix *tiler_postfix,
1978 union midgard_primitive_size *primitive_size)
1979 {
1980 /* Load the shaders */
1981 struct panfrost_context *ctx = batch->ctx;
1982 struct panfrost_device *dev = pan_device(ctx->base.screen);
1983 struct panfrost_shader_state *vs, *fs;
1984 size_t vs_size, fs_size;
1985
1986 /* Allocate the varying descriptor */
1987
1988 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1989 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1990 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1991 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1992
1993 struct panfrost_transfer trans = panfrost_pool_alloc(&batch->pool,
1994 vs_size +
1995 fs_size);
1996
1997 struct pipe_stream_output_info *so = &vs->stream_output;
1998 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1999
2000 /* Check if this varying is linked by us. This is the case for
2001 * general-purpose, non-captured varyings. If it is, link it. If it's
2002 * not, use the provided stream out information to determine the
2003 * offset, since it was already linked for us. */
2004
2005 unsigned gen_offsets[32];
2006 enum mali_format gen_formats[32];
2007 memset(gen_offsets, 0, sizeof(gen_offsets));
2008 memset(gen_formats, 0, sizeof(gen_formats));
2009
2010 unsigned gen_stride = 0;
2011 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
2012 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
2013
2014 unsigned streamout_offsets[32];
2015
2016 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2017 streamout_offsets[i] = panfrost_streamout_offset(
2018 so->stride[i],
2019 ctx->streamout.offsets[i],
2020 ctx->streamout.targets[i]);
2021 }
2022
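/* The vertex shader records come first in the allocation, immediately
 * followed by the fragment shader records */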
2023 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
2024 struct mali_attr_meta *ofs = ovs + vs->varying_count;
2025
2026 for (unsigned i = 0; i < vs->varying_count; i++) {
2027 ovs[i] = panfrost_emit_varying(vs, fs, vs, present,
2028 ctx->streamout.num_targets, streamout_offsets,
2029 dev->quirks,
2030 gen_offsets, gen_formats, &gen_stride, i, true, false);
2031 }
2032
2033 for (unsigned i = 0; i < fs->varying_count; i++) {
2034 ofs[i] = panfrost_emit_varying(fs, vs, vs, present,
2035 ctx->streamout.num_targets, streamout_offsets,
2036 dev->quirks,
2037 gen_offsets, gen_formats, &gen_stride, i, false, true);
2038 }
2039
2040 unsigned xfb_base = pan_xfb_base(present);
2041 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool,
2042 sizeof(union mali_attr) * (xfb_base + ctx->streamout.num_targets));
2043 union mali_attr *varyings = (union mali_attr *) T.cpu;
2044
2045 /* Emit the stream out buffers */
2046
2047 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2048 ctx->vertex_count);
2049
2050 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2051 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2052 so->stride[i],
2053 ctx->streamout.offsets[i],
2054 out_count,
2055 ctx->streamout.targets[i]);
2056 }
2057
2058 panfrost_emit_varyings(batch,
2059 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2060 gen_stride, vertex_count);
2061
2062 /* fp32 vec4 gl_Position */
2063 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2064 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2065 sizeof(float) * 4, vertex_count);
2066
2067 if (present & (1 << PAN_VARY_PSIZ)) {
2068 primitive_size->pointer = panfrost_emit_varyings(batch,
2069 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2070 2, vertex_count);
2071 }
2072
2073 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_VARYING_POINT_COORD);
2074 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_VARYING_FRONT_FACING);
2075 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_VARYING_FRAG_COORD);
2076
2077 vertex_postfix->varyings = T.gpu;
2078 tiler_postfix->varyings = T.gpu;
2079
2080 vertex_postfix->varying_meta = trans.gpu;
2081 tiler_postfix->varying_meta = trans.gpu + vs_size;
2082 }
2083
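/* Pack the vertex and tiler payloads in the flavour the GPU expects (Bifrost
 * or Midgard) and submit them as jobs on the batch, with the tiler job
 * depending on the vertex job. The tiler job is skipped entirely when
 * rasterizer discard is enabled. */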
2084 void
2085 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2086 struct mali_vertex_tiler_prefix *vertex_prefix,
2087 struct mali_vertex_tiler_postfix *vertex_postfix,
2088 struct mali_vertex_tiler_prefix *tiler_prefix,
2089 struct mali_vertex_tiler_postfix *tiler_postfix,
2090 union midgard_primitive_size *primitive_size)
2091 {
2092 struct panfrost_context *ctx = batch->ctx;
2093 struct panfrost_device *device = pan_device(ctx->base.screen);
2094 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
2095 struct bifrost_payload_vertex bifrost_vertex = {0,};
2096 struct bifrost_payload_tiler bifrost_tiler = {0,};
2097 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2098 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2099 void *vp, *tp;
2100 size_t vp_size, tp_size;
2101
2102 if (device->quirks & IS_BIFROST) {
2103 bifrost_vertex.prefix = *vertex_prefix;
2104 bifrost_vertex.postfix = *vertex_postfix;
2105 vp = &bifrost_vertex;
2106 vp_size = sizeof(bifrost_vertex);
2107
2108 bifrost_tiler.prefix = *tiler_prefix;
2109 bifrost_tiler.tiler.primitive_size = *primitive_size;
2110 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2111 bifrost_tiler.postfix = *tiler_postfix;
2112 tp = &bifrost_tiler;
2113 tp_size = sizeof(bifrost_tiler);
2114 } else {
2115 midgard_vertex.prefix = *vertex_prefix;
2116 midgard_vertex.postfix = *vertex_postfix;
2117 vp = &midgard_vertex;
2118 vp_size = sizeof(midgard_vertex);
2119
2120 midgard_tiler.prefix = *tiler_prefix;
2121 midgard_tiler.postfix = *tiler_postfix;
2122 midgard_tiler.primitive_size = *primitive_size;
2123 tp = &midgard_tiler;
2124 tp_size = sizeof(midgard_tiler);
2125 }
2126
2127 if (wallpapering) {
2128 /* Inject in reverse order, with "predicted" job indices.
2129 * THIS IS A HACK XXX */
2130 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
2131 batch->scoreboard.job_index + 2, tp, tp_size, true);
2132 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2133 vp, vp_size, true);
2134 return;
2135 }
2136
2137 /* If rasterizer discard is enabled, only submit the vertex job */
2138
2139 bool rasterizer_discard = ctx->rasterizer &&
2140 ctx->rasterizer->base.rasterizer_discard;
2141
2142 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
2143 vp, vp_size, false);
2144
2145 if (rasterizer_discard)
2146 return;
2147
2148 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
2149 false);
2150 }
2151
2152 /* TODO: stop hardcoding this */
2153 mali_ptr
2154 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2155 {
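/* Each pair below appears to be a 16-bit (x, y) sample position, with 128
 * presumably marking the pixel centre on a 1/256th-of-a-pixel grid; the exact
 * layout of this table is still hardcoded (see the TODO above) */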
2156 uint16_t locations[] = {
2157 128, 128,
2158 0, 256,
2159 0, 256,
2160 0, 256,
2161 0, 256,
2162 0, 256,
2163 0, 256,
2164 0, 256,
2165 0, 256,
2166 0, 256,
2167 0, 256,
2168 0, 256,
2169 0, 256,
2170 0, 256,
2171 0, 256,
2172 0, 256,
2173 0, 256,
2174 0, 256,
2175 0, 256,
2176 0, 256,
2177 0, 256,
2178 0, 256,
2179 0, 256,
2180 0, 256,
2181 0, 256,
2182 0, 256,
2183 0, 256,
2184 0, 256,
2185 0, 256,
2186 0, 256,
2187 0, 256,
2188 0, 256,
2189 128, 128,
2190 0, 0,
2191 0, 0,
2192 0, 0,
2193 0, 0,
2194 0, 0,
2195 0, 0,
2196 0, 0,
2197 0, 0,
2198 0, 0,
2199 0, 0,
2200 0, 0,
2201 0, 0,
2202 0, 0,
2203 0, 0,
2204 0, 0,
2205 };
2206
2207 return panfrost_pool_upload(&batch->pool, locations, 96 * sizeof(uint16_t));
2208 }