panfrost: Update sampler views when the texture bo changes
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
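/* Bifrost vertex/tiler jobs point shared_memory at a mali_shared_memory
 * descriptor instead of a framebuffer descriptor. Best-effort reading of the
 * code below (not authoritative): stack_shift encodes the per-thread stack
 * size derived from the batch's worst-case stack_size, scratchpad is the
 * thread-local storage BO sized from thread_tls_alloc and the core count, and
 * shared_workgroup_count stays ~0 since graphics jobs have no workgroups. */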
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
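/* The low bits of the descriptor pointer seem to act as a type tag: OR-ing
 * in MALI_MFBD marks this as the multi-target framebuffer layout rather
 * than the legacy single-target (SFBD) one. Best-effort reading; only the
 * OR itself is certain from this code. */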
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), could last longer. Also gets
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
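/* The bounds matter because panfrost_vt_set_draw_info() derives the vertex
 * count as (max_index - min_index + 1) and programs offset_bias_correction =
 * -min_index, so a tight range shrinks the vertex job and keeps index
 * arithmetic in range. The minmax cache below avoids rescanning the index
 * buffer on repeated draws over the same range. */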
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
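/* The padded count is handed to the hardware factored as
 * (2 * instance_odd + 1) << instance_shift, which is exactly what the
 * ctz/shift arithmetic below recovers. Worked example (assuming a padded
 * count of 24, a purely illustrative value): 24 = 3 << 3 = (2*1 + 1) << 3,
 * so instance_shift = 3 and instance_odd = 1. */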
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x950020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else {
342 meta->bifrost2.preload_regs = 0x1;
343 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
344 }
345
346 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 } else {
349 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
350 ss->uniform_cutoff);
351 meta->midgard1.work_count = ss->work_reg_count;
352
353 /* TODO: This is not conformant on ES3 */
354 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
355
356 meta->midgard1.flags_lo = 0x20;
357 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
358
359 SET_BIT(meta->midgard1.flags_hi, MALI_WRITES_GLOBAL, ss->writes_global);
360 }
361 }
362
363 static unsigned
364 panfrost_translate_compare_func(enum pipe_compare_func in)
365 {
366 switch (in) {
367 case PIPE_FUNC_NEVER:
368 return MALI_FUNC_NEVER;
369
370 case PIPE_FUNC_LESS:
371 return MALI_FUNC_LESS;
372
373 case PIPE_FUNC_EQUAL:
374 return MALI_FUNC_EQUAL;
375
376 case PIPE_FUNC_LEQUAL:
377 return MALI_FUNC_LEQUAL;
378
379 case PIPE_FUNC_GREATER:
380 return MALI_FUNC_GREATER;
381
382 case PIPE_FUNC_NOTEQUAL:
383 return MALI_FUNC_NOTEQUAL;
384
385 case PIPE_FUNC_GEQUAL:
386 return MALI_FUNC_GEQUAL;
387
388 case PIPE_FUNC_ALWAYS:
389 return MALI_FUNC_ALWAYS;
390
391 default:
392 unreachable("Invalid func");
393 }
394 }
395
396 static unsigned
397 panfrost_translate_stencil_op(enum pipe_stencil_op in)
398 {
399 switch (in) {
400 case PIPE_STENCIL_OP_KEEP:
401 return MALI_STENCIL_KEEP;
402
403 case PIPE_STENCIL_OP_ZERO:
404 return MALI_STENCIL_ZERO;
405
406 case PIPE_STENCIL_OP_REPLACE:
407 return MALI_STENCIL_REPLACE;
408
409 case PIPE_STENCIL_OP_INCR:
410 return MALI_STENCIL_INCR;
411
412 case PIPE_STENCIL_OP_DECR:
413 return MALI_STENCIL_DECR;
414
415 case PIPE_STENCIL_OP_INCR_WRAP:
416 return MALI_STENCIL_INCR_WRAP;
417
418 case PIPE_STENCIL_OP_DECR_WRAP:
419 return MALI_STENCIL_DECR_WRAP;
420
421 case PIPE_STENCIL_OP_INVERT:
422 return MALI_STENCIL_INVERT;
423
424 default:
425 unreachable("Invalid stencil op");
426 }
427 }
428
429 static unsigned
430 translate_tex_wrap(enum pipe_tex_wrap w)
431 {
432 switch (w) {
433 case PIPE_TEX_WRAP_REPEAT:
434 return MALI_WRAP_REPEAT;
435
436 case PIPE_TEX_WRAP_CLAMP:
437 return MALI_WRAP_CLAMP;
438
439 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
440 return MALI_WRAP_CLAMP_TO_EDGE;
441
442 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
443 return MALI_WRAP_CLAMP_TO_BORDER;
444
445 case PIPE_TEX_WRAP_MIRROR_REPEAT:
446 return MALI_WRAP_MIRRORED_REPEAT;
447
448 case PIPE_TEX_WRAP_MIRROR_CLAMP:
449 return MALI_WRAP_MIRRORED_CLAMP;
450
451 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
452 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
453
454 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
455 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
456
457 default:
458 unreachable("Invalid wrap");
459 }
460 }
461
462 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
463 struct mali_sampler_descriptor *hw)
464 {
465 unsigned func = panfrost_translate_compare_func(cso->compare_func);
466 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
467 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
468 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
469 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
470 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
471 unsigned mip_filter = mip_linear ?
472 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
473 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
474
475 *hw = (struct mali_sampler_descriptor) {
476 .filter_mode = min_filter | mag_filter | mip_filter |
477 normalized,
478 .wrap_s = translate_tex_wrap(cso->wrap_s),
479 .wrap_t = translate_tex_wrap(cso->wrap_t),
480 .wrap_r = translate_tex_wrap(cso->wrap_r),
481 .compare_func = panfrost_flip_compare_func(func),
482 .border_color = {
483 cso->border_color.f[0],
484 cso->border_color.f[1],
485 cso->border_color.f[2],
486 cso->border_color.f[3]
487 },
488 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
489 .max_lod = FIXED_16(cso->max_lod, false),
490 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
491 .seamless_cube_map = cso->seamless_cube_map,
492 };
493
494 /* If necessary, we disable mipmapping in the sampler descriptor by
495 * clamping the LOD as tight as possible (from 0 to epsilon,
496 * essentially -- remember these are fixed point numbers, so
497 * epsilon=1/256) */
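/* Worked example (assuming FIXED_16 yields 8 fractional bits, as the
 * epsilon = 1/256 note implies): min_lod = 0.0 encodes as 0x0000, so
 * max_lod becomes 0x0001 = 1/256, pinning sampling to the base level. */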
498
499 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
500 hw->max_lod = hw->min_lod + 1;
501 }
502
503 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
504 struct bifrost_sampler_descriptor *hw)
505 {
506 *hw = (struct bifrost_sampler_descriptor) {
507 .unk1 = 0x1,
508 .wrap_s = translate_tex_wrap(cso->wrap_s),
509 .wrap_t = translate_tex_wrap(cso->wrap_t),
510 .wrap_r = translate_tex_wrap(cso->wrap_r),
511 .unk8 = 0x8,
512 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
513 .norm_coords = cso->normalized_coords,
514 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
515 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
516 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
517 .max_lod = FIXED_16(cso->max_lod, false),
518 };
519
520 /* If necessary, we disable mipmapping in the sampler descriptor by
521 * clamping the LOD as tight as possible (from 0 to epsilon,
522 * essentially -- remember these are fixed point numbers, so
523 * epsilon=1/256) */
524
525 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
526 hw->max_lod = hw->min_lod + 1;
527 }
528
529 static void
530 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
531 struct mali_stencil_test *out)
532 {
533 out->ref = 0; /* Gallium gets it from elsewhere */
534
535 out->mask = in->valuemask;
536 out->func = panfrost_translate_compare_func(in->func);
537 out->sfail = panfrost_translate_stencil_op(in->fail_op);
538 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
539 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
540 }
541
542 static void
543 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
544 struct mali_shader_meta *fragmeta)
545 {
546 if (!ctx->rasterizer) {
547 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
548 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
549 fragmeta->depth_units = 0.0f;
550 fragmeta->depth_factor = 0.0f;
551 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
552 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
553 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, true);
554 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, true);
555 return;
556 }
557
558 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
559
560 bool msaa = rast->multisample;
561
562 /* TODO: Sample size */
563 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
564 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
565 fragmeta->depth_units = rast->offset_units * 2.0f;
566 fragmeta->depth_factor = rast->offset_scale;
567
568 /* XXX: Which bit is which? Does this maybe allow offsetting non-tris? */
569
570 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, rast->offset_tri);
571 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, rast->offset_tri);
572
573 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_NEAR, rast->depth_clip_near);
574 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_CLIP_FAR, rast->depth_clip_far);
575 }
576
577 static void
578 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
579 struct mali_shader_meta *fragmeta)
580 {
581 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
582 int zfunc = PIPE_FUNC_ALWAYS;
583
584 if (!zsa) {
585 struct pipe_stencil_state default_stencil = {
586 .enabled = 0,
587 .func = PIPE_FUNC_ALWAYS,
588 .fail_op = MALI_STENCIL_KEEP,
589 .zfail_op = MALI_STENCIL_KEEP,
590 .zpass_op = MALI_STENCIL_KEEP,
591 .writemask = 0xFF,
592 .valuemask = 0xFF
593 };
594
595 panfrost_make_stencil_state(&default_stencil,
596 &fragmeta->stencil_front);
597 fragmeta->stencil_mask_front = default_stencil.writemask;
598 fragmeta->stencil_back = fragmeta->stencil_front;
599 fragmeta->stencil_mask_back = default_stencil.writemask;
600 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
601 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
602 } else {
603 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
604 zsa->stencil[0].enabled);
605 panfrost_make_stencil_state(&zsa->stencil[0],
606 &fragmeta->stencil_front);
607 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
608 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
609
610 /* If back-stencil is not enabled, use the front values */
611
612 if (zsa->stencil[1].enabled) {
613 panfrost_make_stencil_state(&zsa->stencil[1],
614 &fragmeta->stencil_back);
615 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
616 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
617 } else {
618 fragmeta->stencil_back = fragmeta->stencil_front;
619 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
620 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
621 }
622
623 if (zsa->depth.enabled)
624 zfunc = zsa->depth.func;
625
626 /* Depth state (TODO: Refactor) */
627
628 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
629 zsa->depth.writemask);
630 }
631
632 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
633 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
634 }
635
636 static bool
637 panfrost_fs_required(
638 struct panfrost_shader_state *fs,
639 struct panfrost_blend_final *blend,
640 unsigned rt_count)
641 {
642 /* If we generally have side effects */
643 if (fs->fs_sidefx)
644 return true;
645
646 /* If colour is written we need to execute */
647 for (unsigned i = 0; i < rt_count; ++i) {
648 if (!blend[i].no_colour)
649 return true;
650 }
651
652 /* If depth is written and not implied we need to execute.
653 * TODO: Predicate on Z/S writes being enabled */
654 return (fs->writes_depth || fs->writes_stencil);
655 }
656
657 static void
658 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
659 struct mali_shader_meta *fragmeta,
660 void *rts)
661 {
662 const struct panfrost_device *dev = pan_device(ctx->base.screen);
663 struct panfrost_shader_state *fs;
664 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
665
666 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
667 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
668 !ctx->blend->base.dither);
669
670 /* Get blending setup */
671 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
672
673 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
674 unsigned shader_offset = 0;
675 struct panfrost_bo *shader_bo = NULL;
676
677 for (unsigned c = 0; c < rt_count; ++c)
678 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
679 &shader_offset);
680
681 /* Disable shader execution if we can */
682 if (dev->quirks & MIDGARD_SHADERLESS
683 && !panfrost_fs_required(fs, blend, rt_count)) {
684 fragmeta->shader = 0;
685 fragmeta->attribute_count = 0;
686 fragmeta->varying_count = 0;
687 fragmeta->texture_count = 0;
688 fragmeta->sampler_count = 0;
689
690 /* This feature is not known to work on Bifrost */
691 fragmeta->midgard1.work_count = 1;
692 fragmeta->midgard1.uniform_count = 0;
693 fragmeta->midgard1.uniform_buffer_count = 0;
694 }
695
696 /* If there is a blend shader, work registers are shared. We impose 8
697 * work registers as a limit for blend shaders. Should be lower XXX */
698
699 if (!(dev->quirks & IS_BIFROST)) {
700 for (unsigned c = 0; c < rt_count; ++c) {
701 if (blend[c].is_shader) {
702 fragmeta->midgard1.work_count =
703 MAX2(fragmeta->midgard1.work_count, 8);
704 }
705 }
706 }
707
708 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
709 * copied to the blend_meta appended (by convention), but this is the
710 * field actually read by the hardware (or maybe both are read?).
711 * Specify the last RTi with a blend shader. */
712
713 fragmeta->blend.shader = 0;
714
715 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
716 if (!blend[rt].is_shader)
717 continue;
718
719 fragmeta->blend.shader = blend[rt].shader.gpu |
720 blend[rt].shader.first_tag;
721 break;
722 }
723
724 if (dev->quirks & MIDGARD_SFBD) {
725 /* On single render target (SFBD) platforms, the blend
726 * information is inside the shader meta itself. We additionally
727 * need to signal CAN_DISCARD for nontrivial blend modes (so
728 * we're able to read back the destination buffer) */
729
730 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
731 blend[0].is_shader);
732
733 if (!blend[0].is_shader) {
734 fragmeta->blend.equation = *blend[0].equation.equation;
735 fragmeta->blend.constant = blend[0].equation.constant;
736 }
737
738 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
739 !blend[0].no_blending || fs->can_discard);
740 return;
741 }
742
743 if (dev->quirks & IS_BIFROST) {
744 bool no_blend = true;
745
746 for (unsigned i = 0; i < rt_count; ++i)
747 no_blend &= (blend[i].no_blending | blend[i].no_colour);
748
749 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
750 !fs->can_discard && !fs->writes_depth && no_blend);
751 }
752
753 /* Additional blend descriptor tacked on for jobs using MFBD */
754
755 for (unsigned i = 0; i < rt_count; ++i) {
756 unsigned flags = 0;
757
758 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
759 flags = 0x200;
760
761 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
762 (ctx->pipe_framebuffer.cbufs[i]) &&
763 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
764
765 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
766 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
767 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
768 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
769 }
770
771 if (dev->quirks & IS_BIFROST) {
772 struct bifrost_blend_rt *brts = rts;
773
774 brts[i].flags = flags;
775
776 if (blend[i].is_shader) {
777 /* The blend shader's address needs to be at
778 * the same top 32 bits as the fragment shader.
779 * TODO: Ensure that's always the case.
780 */
781 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
782 (fs->bo->gpu & (0xffffffffull << 32)));
783 brts[i].shader = blend[i].shader.gpu;
784 brts[i].unk2 = 0x0;
785 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
786 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
787 const struct util_format_description *format_desc;
788 format_desc = util_format_description(format);
789
790 brts[i].equation = *blend[i].equation.equation;
791
792 /* TODO: this is a bit more complicated */
793 brts[i].constant = blend[i].equation.constant;
794
795 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
796
797 /* 0x19 disables blending and forces REPLACE
798 * mode (equivalent to rgb_mode = alpha_mode =
799 * x122, colour mask = 0xF). 0x1a allows
800 * blending. */
801 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
802
803 brts[i].shader_type = fs->blend_types[i];
804 } else {
805 /* Dummy attachment for depth-only */
806 brts[i].unk2 = 0x3;
807 brts[i].shader_type = fs->blend_types[i];
808 }
809 } else {
810 struct midgard_blend_rt *mrts = rts;
811 mrts[i].flags = flags;
812
813 if (blend[i].is_shader) {
814 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
815 } else {
816 mrts[i].blend.equation = *blend[i].equation.equation;
817 mrts[i].blend.constant = blend[i].equation.constant;
818 }
819 }
820 }
821 }
822
823 static void
824 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
825 struct mali_shader_meta *fragmeta,
826 void *rts)
827 {
828 const struct panfrost_device *dev = pan_device(ctx->base.screen);
829 struct panfrost_shader_state *fs;
830
831 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
832
833 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
834 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x10;
835 fragmeta->unknown2_4 = 0x4e0;
836
837 /* unknown2_4 has the 0x10 bit set on T6XX and T720. We don't know why this
838 * is required (independent of 32-bit/64-bit descriptors), or why it's
839 * not used on later GPU revisions. Otherwise, all shader jobs fault on
840 * these earlier chips (perhaps this is a chicken bit of some kind).
841 * More investigation is needed. */
842
843 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
844
845 if (dev->quirks & IS_BIFROST) {
846 /* TODO */
847 } else {
848 /* Depending on whether it's legal in the given shader, we try to
849 * enable early-z testing. TODO: respect e-z force */
850
851 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
852 !fs->can_discard && !fs->writes_global &&
853 !fs->writes_depth && !fs->writes_stencil);
854
855 /* Add the writes Z/S flags if needed. */
856 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
857 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
858
859 /* Any time texturing is used, derivatives are implicitly calculated,
860 * so we need to enable helper invocations */
861
862 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
863 fs->helper_invocations);
864
865 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
866
867 bool depth_enabled = fs->writes_depth ||
868 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
869
870 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
871 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
872 }
873
874 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
875 panfrost_frag_meta_zsa_update(ctx, fragmeta);
876 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
877 }
878
879 void
880 panfrost_emit_shader_meta(struct panfrost_batch *batch,
881 enum pipe_shader_type st,
882 struct mali_vertex_tiler_postfix *postfix)
883 {
884 struct panfrost_context *ctx = batch->ctx;
885 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
886
887 if (!ss) {
888 postfix->shader = 0;
889 return;
890 }
891
892 struct mali_shader_meta meta;
893
894 panfrost_shader_meta_init(ctx, st, &meta);
895
896 /* Add the shader BO to the batch. */
897 panfrost_batch_add_bo(batch, ss->bo,
898 PAN_BO_ACCESS_PRIVATE |
899 PAN_BO_ACCESS_READ |
900 panfrost_bo_access_for_stage(st));
901
902 mali_ptr shader_ptr;
903
904 if (st == PIPE_SHADER_FRAGMENT) {
905 struct panfrost_device *dev = pan_device(ctx->base.screen);
906 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
907 size_t desc_size = sizeof(meta);
908 void *rts = NULL;
909 struct panfrost_transfer xfer;
910 unsigned rt_size;
911
912 if (dev->quirks & MIDGARD_SFBD)
913 rt_size = 0;
914 else if (dev->quirks & IS_BIFROST)
915 rt_size = sizeof(struct bifrost_blend_rt);
916 else
917 rt_size = sizeof(struct midgard_blend_rt);
918
919 desc_size += rt_size * rt_count;
920
921 if (rt_size)
922 rts = rzalloc_size(ctx, rt_size * rt_count);
923
924 panfrost_frag_shader_meta_init(ctx, &meta, rts);
925
926 xfer = panfrost_allocate_transient(batch, desc_size);
927
928 memcpy(xfer.cpu, &meta, sizeof(meta));
929 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
930
931 if (rt_size)
932 ralloc_free(rts);
933
934 shader_ptr = xfer.gpu;
935 } else {
936 shader_ptr = panfrost_upload_transient(batch, &meta,
937 sizeof(meta));
938 }
939
940 postfix->shader = shader_ptr;
941 }
942
943 static void
944 panfrost_mali_viewport_init(struct panfrost_context *ctx,
945 struct mali_viewport *mvp)
946 {
947 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
948
949 /* Clip bounds are encoded as floats. The viewport itself is encoded as
950 * (somewhat) asymmetric ints. */
951
952 const struct pipe_scissor_state *ss = &ctx->scissor;
953
954 memset(mvp, 0, sizeof(*mvp));
955
956 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
957 * each direction. Clipping to the viewport in theory should work, but
958 * in practice causes issues when we're not explicitly trying to
959 * scissor */
960
961 *mvp = (struct mali_viewport) {
962 .clip_minx = -INFINITY,
963 .clip_miny = -INFINITY,
964 .clip_maxx = INFINITY,
965 .clip_maxy = INFINITY,
966 };
967
968 /* Always scissor to the viewport by default. */
969 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
970 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
971
972 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
973 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
974
975 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
976 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
977
978 /* Apply the scissor test */
979
980 unsigned minx, miny, maxx, maxy;
981
982 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
983 minx = MAX2(ss->minx, vp_minx);
984 miny = MAX2(ss->miny, vp_miny);
985 maxx = MIN2(ss->maxx, vp_maxx);
986 maxy = MIN2(ss->maxy, vp_maxy);
987 } else {
988 minx = vp_minx;
989 miny = vp_miny;
990 maxx = vp_maxx;
991 maxy = vp_maxy;
992 }
993
994 /* Hardware needs the min/max to be strictly ordered, so flip if we
995 * need to. The viewport transformation in the vertex shader will
996 * handle the negatives if we don't */
997
998 if (miny > maxy) {
999 unsigned temp = miny;
1000 miny = maxy;
1001 maxy = temp;
1002 }
1003
1004 if (minx > maxx) {
1005 unsigned temp = minx;
1006 minx = maxx;
1007 maxx = temp;
1008 }
1009
1010 if (minz > maxz) {
1011 float temp = minz;
1012 minz = maxz;
1013 maxz = temp;
1014 }
1015
1016 /* Clamp to the framebuffer size as a last check */
1017
1018 minx = MIN2(ctx->pipe_framebuffer.width, minx);
1019 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
1020
1021 miny = MIN2(ctx->pipe_framebuffer.height, miny);
1022 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
1023
1024 /* Upload */
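/* viewport1 appears to hold the inclusive maximum, stored off-by-one via
 * MALI_POSITIVE; panfrost_emit_viewport() adds the 1 back when unioning
 * the batch scissor with these values. */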
1025
1026 mvp->viewport0[0] = minx;
1027 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1028
1029 mvp->viewport0[1] = miny;
1030 mvp->viewport1[1] = MALI_POSITIVE(maxy);
1031
1032 bool clip_near = true;
1033 bool clip_far = true;
1034
1035 if (ctx->rasterizer) {
1036 clip_near = ctx->rasterizer->base.depth_clip_near;
1037 clip_far = ctx->rasterizer->base.depth_clip_far;
1038 }
1039
1040 mvp->clip_minz = clip_near ? minz : -INFINITY;
1041 mvp->clip_maxz = clip_far ? maxz : INFINITY;
1042 }
1043
1044 void
1045 panfrost_emit_viewport(struct panfrost_batch *batch,
1046 struct mali_vertex_tiler_postfix *tiler_postfix)
1047 {
1048 struct panfrost_context *ctx = batch->ctx;
1049 struct mali_viewport mvp;
1050
1051 panfrost_mali_viewport_init(batch->ctx, &mvp);
1052
1053 /* Update the job, unless we're doing wallpapering (whose lack of
1054 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1055 * just... be faster :) */
1056
1057 if (!ctx->wallpaper_batch)
1058 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1059 mvp.viewport0[1],
1060 mvp.viewport1[0] + 1,
1061 mvp.viewport1[1] + 1);
1062
1063 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1064 sizeof(mvp));
1065 }
1066
1067 static mali_ptr
1068 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1069 enum pipe_shader_type st,
1070 struct panfrost_constant_buffer *buf,
1071 unsigned index)
1072 {
1073 struct pipe_constant_buffer *cb = &buf->cb[index];
1074 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1075
1076 if (rsrc) {
1077 panfrost_batch_add_bo(batch, rsrc->bo,
1078 PAN_BO_ACCESS_SHARED |
1079 PAN_BO_ACCESS_READ |
1080 panfrost_bo_access_for_stage(st));
1081
1082 /* Alignment guaranteed by
1083 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1084 return rsrc->bo->gpu + cb->buffer_offset;
1085 } else if (cb->user_buffer) {
1086 return panfrost_upload_transient(batch,
1087 cb->user_buffer +
1088 cb->buffer_offset,
1089 cb->buffer_size);
1090 } else {
1091 unreachable("No constant buffer");
1092 }
1093 }
1094
1095 struct sysval_uniform {
1096 union {
1097 float f[4];
1098 int32_t i[4];
1099 uint32_t u[4];
1100 uint64_t du[2];
1101 };
1102 };
1103
1104 static void
1105 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1106 struct sysval_uniform *uniform)
1107 {
1108 struct panfrost_context *ctx = batch->ctx;
1109 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1110
1111 uniform->f[0] = vp->scale[0];
1112 uniform->f[1] = vp->scale[1];
1113 uniform->f[2] = vp->scale[2];
1114 }
1115
1116 static void
1117 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1118 struct sysval_uniform *uniform)
1119 {
1120 struct panfrost_context *ctx = batch->ctx;
1121 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1122
1123 uniform->f[0] = vp->translate[0];
1124 uniform->f[1] = vp->translate[1];
1125 uniform->f[2] = vp->translate[2];
1126 }
1127
1128 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1129 enum pipe_shader_type st,
1130 unsigned int sysvalid,
1131 struct sysval_uniform *uniform)
1132 {
1133 struct panfrost_context *ctx = batch->ctx;
1134 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1135 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1136 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1137 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1138
1139 assert(dim);
1140 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1141
1142 if (dim > 1)
1143 uniform->i[1] = u_minify(tex->texture->height0,
1144 tex->u.tex.first_level);
1145
1146 if (dim > 2)
1147 uniform->i[2] = u_minify(tex->texture->depth0,
1148 tex->u.tex.first_level);
1149
1150 if (is_array)
1151 uniform->i[dim] = tex->texture->array_size;
1152 }
1153
1154 static void
1155 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1156 enum pipe_shader_type st,
1157 unsigned ssbo_id,
1158 struct sysval_uniform *uniform)
1159 {
1160 struct panfrost_context *ctx = batch->ctx;
1161
1162 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1163 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1164
1165 /* Compute address */
1166 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1167
1168 panfrost_batch_add_bo(batch, bo,
1169 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1170 panfrost_bo_access_for_stage(st));
1171
1172 /* Upload address and size as sysval */
1173 uniform->du[0] = bo->gpu + sb.buffer_offset;
1174 uniform->u[2] = sb.buffer_size;
1175 }
1176
1177 static void
1178 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1179 enum pipe_shader_type st,
1180 unsigned samp_idx,
1181 struct sysval_uniform *uniform)
1182 {
1183 struct panfrost_context *ctx = batch->ctx;
1184 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1185
1186 uniform->f[0] = sampl->min_lod;
1187 uniform->f[1] = sampl->max_lod;
1188 uniform->f[2] = sampl->lod_bias;
1189
1190 /* Even without any errata, Midgard represents "no mipmapping" as
1191 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1192 * panfrost_create_sampler_state which also explains our choice of
1193 * epsilon value (again to keep behaviour consistent) */
1194
1195 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1196 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1197 }
1198
1199 static void
1200 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1201 struct sysval_uniform *uniform)
1202 {
1203 struct panfrost_context *ctx = batch->ctx;
1204
1205 uniform->u[0] = ctx->compute_grid->grid[0];
1206 uniform->u[1] = ctx->compute_grid->grid[1];
1207 uniform->u[2] = ctx->compute_grid->grid[2];
1208 }
1209
1210 static void
1211 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1212 struct panfrost_shader_state *ss,
1213 enum pipe_shader_type st)
1214 {
1215 struct sysval_uniform *uniforms = (void *)buf;
1216
1217 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1218 int sysval = ss->sysval[i];
1219
1220 switch (PAN_SYSVAL_TYPE(sysval)) {
1221 case PAN_SYSVAL_VIEWPORT_SCALE:
1222 panfrost_upload_viewport_scale_sysval(batch,
1223 &uniforms[i]);
1224 break;
1225 case PAN_SYSVAL_VIEWPORT_OFFSET:
1226 panfrost_upload_viewport_offset_sysval(batch,
1227 &uniforms[i]);
1228 break;
1229 case PAN_SYSVAL_TEXTURE_SIZE:
1230 panfrost_upload_txs_sysval(batch, st,
1231 PAN_SYSVAL_ID(sysval),
1232 &uniforms[i]);
1233 break;
1234 case PAN_SYSVAL_SSBO:
1235 panfrost_upload_ssbo_sysval(batch, st,
1236 PAN_SYSVAL_ID(sysval),
1237 &uniforms[i]);
1238 break;
1239 case PAN_SYSVAL_NUM_WORK_GROUPS:
1240 panfrost_upload_num_work_groups_sysval(batch,
1241 &uniforms[i]);
1242 break;
1243 case PAN_SYSVAL_SAMPLER:
1244 panfrost_upload_sampler_sysval(batch, st,
1245 PAN_SYSVAL_ID(sysval),
1246 &uniforms[i]);
1247 break;
1248 default:
1249 assert(0);
1250 }
1251 }
1252 }
1253
1254 static const void *
1255 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1256 unsigned index)
1257 {
1258 struct pipe_constant_buffer *cb = &buf->cb[index];
1259 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1260
1261 if (rsrc)
1262 return rsrc->bo->cpu;
1263 else if (cb->user_buffer)
1264 return cb->user_buffer;
1265 else
1266 unreachable("No constant buffer");
1267 }
1268
1269 void
1270 panfrost_emit_const_buf(struct panfrost_batch *batch,
1271 enum pipe_shader_type stage,
1272 struct mali_vertex_tiler_postfix *postfix)
1273 {
1274 struct panfrost_context *ctx = batch->ctx;
1275 struct panfrost_shader_variants *all = ctx->shader[stage];
1276
1277 if (!all)
1278 return;
1279
1280 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1281
1282 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1283
1284 /* Uniforms are implicitly UBO #0 */
1285 bool has_uniforms = buf->enabled_mask & (1 << 0);
1286
1287 /* Allocate room for the sysval and the uniforms */
1288 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1289 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1290 size_t size = sys_size + uniform_size;
1291 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1292 size);
1293
1294 /* Upload sysvals requested by the shader */
1295 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1296
1297 /* Upload uniforms */
1298 if (has_uniforms && uniform_size) {
1299 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1300 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1301 }
1302
1303 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1304 * uploaded */
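/* Each entry packs a size (counted in 16-byte fields) together with the GPU
 * address via MALI_MAKE_UBO, which is why raw sizes are aligned up to 16
 * bytes below. UBO #0 is special: it covers the transfer we just filled,
 * i.e. the sysvals followed by the user uniforms. */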
1305
1306 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1307 assert(ubo_count >= 1);
1308
1309 size_t sz = sizeof(uint64_t) * ubo_count;
1310 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1311 int uniform_count = ss->uniform_count;
1312
1313 /* Upload uniforms as a UBO */
1314 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1315
1316 /* The rest are honest-to-goodness UBOs */
1317
1318 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1319 size_t usz = buf->cb[ubo].buffer_size;
1320 bool enabled = buf->enabled_mask & (1 << ubo);
1321 bool empty = usz == 0;
1322
1323 if (!enabled || empty) {
1324 /* Stub out disabled UBOs to catch accesses */
1325 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1326 continue;
1327 }
1328
1329 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1330 buf, ubo);
1331
1332 unsigned bytes_per_field = 16;
1333 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1334 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1335 }
1336
1337 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1338 postfix->uniforms = transfer.gpu;
1339 postfix->uniform_buffers = ubufs;
1340
1341 buf->dirty_mask = 0;
1342 }
1343
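/* Emits the shared memory descriptor for a compute job. Rough reading of the
 * sizing below: each workgroup's shared segment is rounded up to a power of
 * two (at least 128 bytes), scaled by the number of workgroups in the grid
 * (times 4, presumably headroom for workgroups in flight), and
 * shared_workgroup_count/shared_shift encode the corresponding log2 sizes. */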
1344 void
1345 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1346 const struct pipe_grid_info *info,
1347 struct midgard_payload_vertex_tiler *vtp)
1348 {
1349 struct panfrost_context *ctx = batch->ctx;
1350 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1351 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1352 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1353 128));
1354 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1355 info->grid[2] * 4;
1356 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1357 shared_size,
1358 1);
1359
1360 struct mali_shared_memory shared = {
1361 .shared_memory = bo->gpu,
1362 .shared_workgroup_count =
1363 util_logbase2_ceil(info->grid[0]) +
1364 util_logbase2_ceil(info->grid[1]) +
1365 util_logbase2_ceil(info->grid[2]),
1366 .shared_unk1 = 0x2,
1367 .shared_shift = util_logbase2(single_size) - 1
1368 };
1369
1370 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1371 sizeof(shared));
1372 }
1373
1374 static mali_ptr
1375 panfrost_get_tex_desc(struct panfrost_batch *batch,
1376 enum pipe_shader_type st,
1377 struct panfrost_sampler_view *view)
1378 {
1379 if (!view)
1380 return (mali_ptr) 0;
1381
1382 struct pipe_sampler_view *pview = &view->base;
1383 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1384
1385 /* Add the BO to the job so it's retained until the job is done. */
1386
1387 panfrost_batch_add_bo(batch, rsrc->bo,
1388 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1389 panfrost_bo_access_for_stage(st));
1390
1391 panfrost_batch_add_bo(batch, view->bo,
1392 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1393 panfrost_bo_access_for_stage(st));
1394
1395 return view->bo->gpu;
1396 }
1397
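/* Texture descriptors are cached per sampler view, but the underlying
 * resource can get a new backing BO or change layout (e.g. linear vs.
 * tiled/AFBC) behind our back. Rebuild the descriptor BO whenever the cached
 * texture_bo/layout no longer match the resource, so we never emit a stale
 * GPU address. */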
1398 static void
1399 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1400 struct pipe_context *pctx)
1401 {
1402 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1403 if (view->texture_bo != rsrc->bo->gpu ||
1404 view->layout != rsrc->layout) {
1405 panfrost_bo_unreference(view->bo);
1406 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1407 }
1408 }
1409
1410 void
1411 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1412 enum pipe_shader_type stage,
1413 struct mali_vertex_tiler_postfix *postfix)
1414 {
1415 struct panfrost_context *ctx = batch->ctx;
1416 struct panfrost_device *device = pan_device(ctx->base.screen);
1417
1418 if (!ctx->sampler_view_count[stage])
1419 return;
1420
1421 if (device->quirks & IS_BIFROST) {
1422 struct bifrost_texture_descriptor *descriptors;
1423
1424 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1425 ctx->sampler_view_count[stage]);
1426
1427 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1428 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1429 struct pipe_sampler_view *pview = &view->base;
1430 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1431 panfrost_update_sampler_view(view, &ctx->base);
1432
1433 /* Add the BOs to the job so they are retained until the job is done. */
1434
1435 panfrost_batch_add_bo(batch, rsrc->bo,
1436 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1437 panfrost_bo_access_for_stage(stage));
1438
1439 panfrost_batch_add_bo(batch, view->bo,
1440 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1441 panfrost_bo_access_for_stage(stage));
1442
1443 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1444 }
1445
1446 postfix->textures = panfrost_upload_transient(batch,
1447 descriptors,
1448 sizeof(struct bifrost_texture_descriptor) *
1449 ctx->sampler_view_count[stage]);
1450
1451 free(descriptors);
1452 } else {
1453 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1454
1455 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1456 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1457
1458 panfrost_update_sampler_view(view, &ctx->base);
1459
1460 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1461 }
1462
1463 postfix->textures = panfrost_upload_transient(batch,
1464 trampolines,
1465 sizeof(uint64_t) *
1466 ctx->sampler_view_count[stage]);
1467 }
1468 }
1469
1470 void
1471 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1472 enum pipe_shader_type stage,
1473 struct mali_vertex_tiler_postfix *postfix)
1474 {
1475 struct panfrost_context *ctx = batch->ctx;
1476 struct panfrost_device *device = pan_device(ctx->base.screen);
1477
1478 if (!ctx->sampler_count[stage])
1479 return;
1480
1481 if (device->quirks & IS_BIFROST) {
1482 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1483 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1484 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1485 transfer_size);
1486 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1487
1488 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1489 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1490
1491 postfix->sampler_descriptor = transfer.gpu;
1492 } else {
1493 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1494 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1495 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1496 transfer_size);
1497 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1498
1499 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1500 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1501
1502 postfix->sampler_descriptor = transfer.gpu;
1503 }
1504 }
1505
1506 void
1507 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1508 struct mali_vertex_tiler_postfix *vertex_postfix)
1509 {
1510 struct panfrost_context *ctx = batch->ctx;
1511
1512 if (!ctx->vertex)
1513 return;
1514
1515 struct panfrost_vertex_state *so = ctx->vertex;
1516
1517 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1518 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1519 sizeof(*so->hw) *
1520 PAN_MAX_ATTRIBUTE);
1521 }
1522
1523 void
1524 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1525 struct mali_vertex_tiler_postfix *vertex_postfix)
1526 {
1527 struct panfrost_context *ctx = batch->ctx;
1528 struct panfrost_vertex_state *so = ctx->vertex;
1529
1530 /* Staged mali_attr, and index into them. i =/= k, depending on the
1531 * vertex buffer mask and instancing. Twice as much room is allocated,
1532 * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1533 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1534 unsigned k = 0;
1535
1536 for (unsigned i = 0; i < so->num_elements; ++i) {
1537 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1538 * means duplicating some vertex buffers (who cares? aside from
1539 * maybe some caching implications but I somehow doubt that
1540 * matters) */
1541
1542 struct pipe_vertex_element *elem = &so->pipe[i];
1543 unsigned vbi = elem->vertex_buffer_index;
1544
1545 /* The exception to 1:1 mapping is that we can have multiple
1546 * entries (NPOT divisors), so we fix up anyway */
1547
1548 so->hw[i].index = k;
1549
1550 if (!(ctx->vb_mask & (1 << vbi)))
1551 continue;
1552
1553 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1554 struct panfrost_resource *rsrc;
1555
1556 rsrc = pan_resource(buf->buffer.resource);
1557 if (!rsrc)
1558 continue;
1559
1560 /* Align to 64 bytes by masking off the lower bits. This
1561 * will be adjusted back when we fixup the src_offset in
1562 * mali_attr_meta */
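/* Worked example with a hypothetical address: raw_addr = 0x10000046
 * gives addr = 0x10000040 and chopped_addr = 6; those 6 bytes are added
 * back to the size below and re-applied as src_offset so no vertex data
 * gets clipped off the end. */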
1563
1564 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1565 mali_ptr addr = raw_addr & ~63;
1566 unsigned chopped_addr = raw_addr - addr;
1567
1568 /* Add a dependency of the batch on the vertex buffer */
1569 panfrost_batch_add_bo(batch, rsrc->bo,
1570 PAN_BO_ACCESS_SHARED |
1571 PAN_BO_ACCESS_READ |
1572 PAN_BO_ACCESS_VERTEX_TILER);
1573
1574 /* Set common fields */
1575 attrs[k].elements = addr;
1576 attrs[k].stride = buf->stride;
1577
1578 /* Since we advanced the base pointer, we shrink the buffer
1579 * size */
1580 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1581
1582 /* We need to add the extra size we masked off (for
1583 * correctness) so the data doesn't get clamped away */
1584 attrs[k].size += chopped_addr;
1585
1586 /* For non-instancing make sure we initialize */
1587 attrs[k].shift = attrs[k].extra_flags = 0;
1588
1589 /* Instancing uses a dramatically different code path than
1590 * linear, so dispatch for the actual emission now that the
1591 * common code is finished */
1592
1593 unsigned divisor = elem->instance_divisor;
1594
1595 if (divisor && ctx->instance_count == 1) {
1596 /* Silly corner case where there's a divisor(=1) but
1597 * there's no legitimate instancing. So we want *every*
1598 * attribute to be the same. So set stride to zero so
1599 * we don't go anywhere. */
1600
1601 attrs[k].size = attrs[k].stride + chopped_addr;
1602 attrs[k].stride = 0;
1603 attrs[k++].elements |= MALI_ATTR_LINEAR;
1604 } else if (ctx->instance_count <= 1) {
1605 /* Normal, non-instanced attributes */
1606 attrs[k++].elements |= MALI_ATTR_LINEAR;
1607 } else {
1608 unsigned instance_shift = vertex_postfix->instance_shift;
1609 unsigned instance_odd = vertex_postfix->instance_odd;
1610
1611 k += panfrost_vertex_instanced(ctx->padded_count,
1612 instance_shift,
1613 instance_odd,
1614 divisor, &attrs[k]);
1615 }
1616 }
1617
1618 /* Add special gl_VertexID/gl_InstanceID buffers */
1619
1620 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1621 so->hw[PAN_VERTEX_ID].index = k++;
1622 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1623 so->hw[PAN_INSTANCE_ID].index = k++;
1624
1625 /* Upload whatever we emitted and go */
1626
1627 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1628 k * sizeof(*attrs));
1629 }
1630
1631 static mali_ptr
1632 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1633 unsigned stride, unsigned count)
1634 {
1635 /* Fill out the descriptor */
1636 slot->stride = stride;
1637 slot->size = stride * count;
1638 slot->shift = slot->extra_flags = 0;
1639
1640 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1641 slot->size);
1642
1643 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1644
1645 return transfer.gpu;
1646 }
1647
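/* Streamout targets can be bound at arbitrary byte offsets, but the varying
 * buffer records below want a 64-byte-aligned base (the address is masked
 * with ~63). This returns the sub-64-byte remainder, which the varying
 * record's src_offset has to absorb; panfrost_emit_streamout() compensates
 * by growing the record size by the same amount. */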
1648 static unsigned
1649 panfrost_streamout_offset(unsigned stride, unsigned offset,
1650 struct pipe_stream_output_target *target)
1651 {
1652 return (target->buffer_offset + (offset * stride * 4)) & 63;
1653 }
1654
1655 static void
1656 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1657 unsigned stride, unsigned offset, unsigned count,
1658 struct pipe_stream_output_target *target)
1659 {
1660 /* Fill out the descriptor */
1661 slot->stride = stride * 4;
1662 slot->shift = slot->extra_flags = 0;
1663
1664 unsigned max_size = target->buffer_size;
1665 unsigned expected_size = slot->stride * count;
1666
1667 /* Grab the BO and bind it to the batch */
1668 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1669
1670 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1671 * the perspective of the TILER and FRAGMENT.
1672 */
1673 panfrost_batch_add_bo(batch, bo,
1674 PAN_BO_ACCESS_SHARED |
1675 PAN_BO_ACCESS_RW |
1676 PAN_BO_ACCESS_VERTEX_TILER |
1677 PAN_BO_ACCESS_FRAGMENT);
1678
1679 /* We will have an offset applied to get alignment */
1680 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1681 slot->elements = (addr & ~63) | MALI_ATTR_LINEAR;
1682 slot->size = MIN2(max_size, expected_size) + (addr & 63);
1683 }
1684
1685 static bool
1686 has_point_coord(unsigned mask, gl_varying_slot loc)
1687 {
1688 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1689 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1690 else if (loc == VARYING_SLOT_PNTC)
1691 return (mask & (1 << 8));
1692 else
1693 return false;
1694 }
1695
1696 /* Helpers for manipulating stream out information so we can pack varyings
1697 * accordingly. Compute the src_offset for a given captured varying */
1698
1699 static struct pipe_stream_output *
1700 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1701 {
1702 for (unsigned i = 0; i < info->num_outputs; ++i) {
1703 if (info->output[i].register_index == loc)
1704 return &info->output[i];
1705 }
1706
1707 unreachable("Varying not captured");
1708 }
1709
1710 static unsigned
1711 pan_varying_size(enum mali_format fmt)
1712 {
1713 unsigned type = MALI_EXTRACT_TYPE(fmt);
1714 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1715 unsigned bits = MALI_EXTRACT_BITS(fmt);
1716 unsigned bpc = 0;
1717
1718 if (bits == MALI_CHANNEL_FLOAT) {
1719 /* No doubles */
1720 bool fp16 = (type == MALI_FORMAT_SINT);
1721 assert(fp16 || (type == MALI_FORMAT_UNORM));
1722
1723 bpc = fp16 ? 2 : 4;
1724 } else {
1725 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1726
1727 /* See the enums */
1728 bits = 1 << bits;
1729 assert(bits >= 8);
1730 bpc = bits / 8;
1731 }
1732
1733 return bpc * chan;
1734 }
1735
1736 /* Indices for named (non-XFB) varyings that are present. These are packed
1737 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1738 * PAN_VARY_*). This has the nice property that you can lookup the buffer index
1739 * of a given special field given a shift S by:
1740 *
1741 * idx = popcount(P & ((1 << S) - 1))
1742 *
1743 * That is... look at all of the varyings that come earlier and count them; that
1744 * count is the index of the new one. Likewise, the total number of special
1745 * buffers required is simply popcount(P)
1746 */
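/* Worked example: with general, position and point size present,
 * P = 0b0111, so PAN_VARY_PSIZ (S = 2) lands in buffer
 * popcount(0b0111 & 0b0011) = 2 and pan_xfb_base(P) = popcount(0b0111) = 3,
 * i.e. any XFB buffers would start at index 3. */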
1747
1748 enum pan_special_varying {
1749 PAN_VARY_GENERAL = 0,
1750 PAN_VARY_POSITION = 1,
1751 PAN_VARY_PSIZ = 2,
1752 PAN_VARY_PNTCOORD = 3,
1753 PAN_VARY_FACE = 4,
1754 PAN_VARY_FRAGCOORD = 5,
1755
1756 /* Keep last */
1757 PAN_VARY_MAX,
1758 };
1759
1760 /* Given a varying, figure out which index it corresponds to */
1761
1762 static inline unsigned
1763 pan_varying_index(unsigned present, enum pan_special_varying v)
1764 {
1765 unsigned mask = (1 << v) - 1;
1766 return util_bitcount(present & mask);
1767 }
1768
1769 /* Get the base offset for XFB buffers, which by convention come after
1770 * everything else. Wrapper function for semantic reasons; by construction this
1771 * is just popcount. */
1772
1773 static inline unsigned
1774 pan_xfb_base(unsigned present)
1775 {
1776 return util_bitcount(present);
1777 }
1778
1779 /* Computes the present mask for varyings so we can start emitting varying records */
1780
1781 static inline unsigned
1782 pan_varying_present(
1783 struct panfrost_shader_state *vs,
1784 struct panfrost_shader_state *fs,
1785 unsigned quirks)
1786 {
1787 /* At the moment we always emit general and position buffers. Not
1788 * strictly necessary but usually harmless */
1789
1790 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1791
1792 /* Enable special buffers by the shader info */
1793
1794 if (vs->writes_point_size)
1795 present |= (1 << PAN_VARY_PSIZ);
1796
1797 if (fs->reads_point_coord)
1798 present |= (1 << PAN_VARY_PNTCOORD);
1799
1800 if (fs->reads_face)
1801 present |= (1 << PAN_VARY_FACE);
1802
1803 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1804 present |= (1 << PAN_VARY_FRAGCOORD);
1805
1806 /* Also, if we have a point sprite, we need a point coord buffer */
1807
1808 for (unsigned i = 0; i < fs->varying_count; i++) {
1809 gl_varying_slot loc = fs->varyings_loc[i];
1810
1811 if (has_point_coord(fs->point_sprite_mask, loc))
1812 present |= (1 << PAN_VARY_PNTCOORD);
1813 }
1814
1815 return present;
1816 }
1817
1818 /* Emitters for varying records */
1819
1820 static struct mali_attr_meta
1821 pan_emit_vary(unsigned present, enum pan_special_varying buf,
1822 unsigned quirks, enum mali_format format,
1823 unsigned offset)
1824 {
1825 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1826
1827 struct mali_attr_meta meta = {
1828 .index = pan_varying_index(present, buf),
1829 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1830 .swizzle = quirks & HAS_SWIZZLES ?
1831 panfrost_get_default_swizzle(nr_channels) :
1832 panfrost_bifrost_swizzle(nr_channels),
1833 .format = format,
1834 .src_offset = offset
1835 };
1836
1837 return meta;
1838 }
1839
1840 /* General varying with no counterpart in the linked stage, emitted as a discard record */
1841
1842 static struct mali_attr_meta
1843 pan_emit_vary_only(unsigned present, unsigned quirks)
1844 {
1845 return pan_emit_vary(present, 0, quirks, MALI_VARYING_DISCARD, 0);
1846 }
1847
1848 /* Special records */
1849
1850 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1851 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1852 [PAN_VARY_PSIZ] = MALI_R16F,
1853 [PAN_VARY_PNTCOORD] = MALI_R16F,
1854 [PAN_VARY_FACE] = MALI_R32I,
1855 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1856 };
1857
1858 static struct mali_attr_meta
1859 pan_emit_vary_special(unsigned present, enum pan_special_varying buf,
1860 unsigned quirks)
1861 {
1862 assert(buf < PAN_VARY_MAX);
1863 return pan_emit_vary(present, buf, quirks, pan_varying_formats[buf], 0);
1864 }
1865
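/* XFB captures are written at highp (32 bits per channel): keep the varying's
 * base type, force 32-bit channels, and take the channel count from the
 * stream output record being captured. */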
1866 static enum mali_format
1867 pan_xfb_format(enum mali_format format, unsigned nr)
1868 {
1869 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1870 return MALI_R32F | MALI_NR_CHANNELS(nr);
1871 else
1872 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1873 }
1874
1875 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1876 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1877 * value. */
1878
1879 static struct mali_attr_meta
1880 pan_emit_vary_xfb(unsigned present,
1881 unsigned max_xfb,
1882 unsigned *streamout_offsets,
1883 unsigned quirks,
1884 enum mali_format format,
1885 struct pipe_stream_output o)
1886 {
1887         /* Construct a record for it */
1888 struct mali_attr_meta meta = {
1889 /* XFB buffers come after everything else */
1890 .index = pan_xfb_base(present) + o.output_buffer,
1891
1892 /* As usual unknown bit */
1893 .unknown1 = quirks & IS_BIFROST ? 0x0 : 0x2,
1894
1895 /* Override swizzle with number of channels */
1896 .swizzle = quirks & HAS_SWIZZLES ?
1897 panfrost_get_default_swizzle(o.num_components) :
1898 panfrost_bifrost_swizzle(o.num_components),
1899
1900 /* Override number of channels and precision to highp */
1901 .format = pan_xfb_format(format, o.num_components),
1902
1903 /* Apply given offsets together */
1904 .src_offset = (o.dst_offset * 4) /* dwords */
1905 + streamout_offsets[o.output_buffer]
1906 };
1907
1908 return meta;
1909 }
1910
1911 /* Determine if we should capture a varying for XFB. This requires actually
1912  * having a buffer for it. If we don't capture it, we'll fall back to a general
1913 * varying path (linked or unlinked, possibly discarding the write) */
1914
1915 static bool
1916 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1917 unsigned loc, unsigned max_xfb)
1918 {
1919 if (!(xfb->so_mask & (1ll << loc)))
1920 return false;
1921
1922 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1923 return o->output_buffer < max_xfb;
1924 }
1925
1926 /* Higher-level wrapper around all of the above, classifying a varying into one
1927 * of the above types */
1928
1929 static struct mali_attr_meta
1930 panfrost_emit_varying(
1931 struct panfrost_shader_state *stage,
1932 struct panfrost_shader_state *other,
1933 struct panfrost_shader_state *xfb,
1934 unsigned present,
1935 unsigned max_xfb,
1936 unsigned *streamout_offsets,
1937 unsigned quirks,
1938 unsigned *gen_offsets,
1939 enum mali_format *gen_formats,
1940 unsigned *gen_stride,
1941 unsigned idx,
1942 bool should_alloc,
1943 bool is_fragment)
1944 {
1945 gl_varying_slot loc = stage->varyings_loc[idx];
1946 enum mali_format format = stage->varyings[idx];
1947
1948 /* Override format to match linkage */
1949 if (!should_alloc && gen_formats[idx])
1950 format = gen_formats[idx];
1951
1952 if (has_point_coord(stage->point_sprite_mask, loc)) {
1953 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1954 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1955 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1956 return pan_emit_vary_xfb(present, max_xfb, streamout_offsets, quirks, format, *o);
1957 } else if (loc == VARYING_SLOT_POS) {
1958 if (is_fragment)
1959 return pan_emit_vary_special(present, PAN_VARY_FRAGCOORD, quirks);
1960 else
1961 return pan_emit_vary_special(present, PAN_VARY_POSITION, quirks);
1962 } else if (loc == VARYING_SLOT_PSIZ) {
1963 return pan_emit_vary_special(present, PAN_VARY_PSIZ, quirks);
1964 } else if (loc == VARYING_SLOT_PNTC) {
1965 return pan_emit_vary_special(present, PAN_VARY_PNTCOORD, quirks);
1966 } else if (loc == VARYING_SLOT_FACE) {
1967 return pan_emit_vary_special(present, PAN_VARY_FACE, quirks);
1968 }
1969
1970 /* We've exhausted special cases, so it's otherwise a general varying. Check if we're linked */
1971 signed other_idx = -1;
1972
1973 for (unsigned j = 0; j < other->varying_count; ++j) {
1974 if (other->varyings_loc[j] == loc) {
1975 other_idx = j;
1976 break;
1977 }
1978 }
1979
1980 if (other_idx < 0)
1981 return pan_emit_vary_only(present, quirks);
1982
1983 unsigned offset = gen_offsets[other_idx];
1984
1985 if (should_alloc) {
1986                 /* We're linked, so allocate space via a watermark allocation */
1987 enum mali_format alt = other->varyings[other_idx];
1988
1989 /* Do interpolation at minimum precision */
1990 unsigned size_main = pan_varying_size(format);
1991 unsigned size_alt = pan_varying_size(alt);
1992 unsigned size = MIN2(size_main, size_alt);
1993
1994 /* If a varying is marked for XFB but not actually captured, we
1995 * should match the format to the format that would otherwise
1996 * be used for XFB, since dEQP checks for invariance here. It's
1997 * unclear if this is required by the spec. */
1998
1999 if (xfb->so_mask & (1ull << loc)) {
2000 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
2001 format = pan_xfb_format(format, o->num_components);
2002 size = pan_varying_size(format);
2003 } else if (size == size_alt) {
2004 format = alt;
2005 }
2006
2007 gen_offsets[idx] = *gen_stride;
2008 gen_formats[other_idx] = format;
2009 offset = *gen_stride;
2010 *gen_stride += size;
2011 }
2012
2013 return pan_emit_vary(present, PAN_VARY_GENERAL,
2014 quirks, format, offset);
2015 }
2016
2017 static void
2018 pan_emit_special_input(union mali_attr *varyings,
2019 unsigned present,
2020 enum pan_special_varying v,
2021 mali_ptr addr)
2022 {
2023 if (present & (1 << v)) {
2024 /* Ensure we write exactly once for performance and with fields
2025 * zeroed appropriately to avoid flakes */
2026
2027 union mali_attr s = {
2028 .elements = addr
2029 };
2030
2031 varyings[pan_varying_index(present, v)] = s;
2032 }
2033 }
2034
2035 void
2036 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
2037 unsigned vertex_count,
2038 struct mali_vertex_tiler_postfix *vertex_postfix,
2039 struct mali_vertex_tiler_postfix *tiler_postfix,
2040 union midgard_primitive_size *primitive_size)
2041 {
2042 /* Load the shaders */
2043 struct panfrost_context *ctx = batch->ctx;
2044 struct panfrost_device *dev = pan_device(ctx->base.screen);
2045 struct panfrost_shader_state *vs, *fs;
2046 size_t vs_size, fs_size;
2047
2048 /* Allocate the varying descriptor */
2049
2050 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
2051 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
2052 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
2053 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
2054
2055 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
2056 vs_size +
2057 fs_size);
2058
2059 struct pipe_stream_output_info *so = &vs->stream_output;
2060 unsigned present = pan_varying_present(vs, fs, dev->quirks);
2061
2062 /* Check if this varying is linked by us. This is the case for
2063 * general-purpose, non-captured varyings. If it is, link it. If it's
2064 * not, use the provided stream out information to determine the
2065 * offset, since it was already linked for us. */
2066
2067 unsigned gen_offsets[32];
2068 enum mali_format gen_formats[32];
2069 memset(gen_offsets, 0, sizeof(gen_offsets));
2070 memset(gen_formats, 0, sizeof(gen_formats));
2071
2072 unsigned gen_stride = 0;
2073 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
2074 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
2075
2076 unsigned streamout_offsets[32];
2077
2078 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2079 streamout_offsets[i] = panfrost_streamout_offset(
2080 so->stride[i],
2081 ctx->streamout.offsets[i],
2082 ctx->streamout.targets[i]);
2083 }
2084
2085 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
2086 struct mali_attr_meta *ofs = ovs + vs->varying_count;
2087
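        /* Emit the vertex-side records first, allocating general varying space
         * (should_alloc = true) as we link; the fragment-side records then
         * reuse the offsets and formats recorded during that pass. */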
2088 for (unsigned i = 0; i < vs->varying_count; i++) {
2089 ovs[i] = panfrost_emit_varying(vs, fs, vs, present,
2090 ctx->streamout.num_targets, streamout_offsets,
2091 dev->quirks,
2092 gen_offsets, gen_formats, &gen_stride, i, true, false);
2093 }
2094
2095 for (unsigned i = 0; i < fs->varying_count; i++) {
2096 ofs[i] = panfrost_emit_varying(fs, vs, vs, present,
2097 ctx->streamout.num_targets, streamout_offsets,
2098 dev->quirks,
2099 gen_offsets, gen_formats, &gen_stride, i, false, true);
2100 }
2101
2102 unsigned xfb_base = pan_xfb_base(present);
2103 struct panfrost_transfer T = panfrost_allocate_transient(batch,
2104 sizeof(union mali_attr) * (xfb_base + ctx->streamout.num_targets));
2105 union mali_attr *varyings = (union mali_attr *) T.cpu;
2106
2107 /* Emit the stream out buffers */
2108
2109 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
2110 ctx->vertex_count);
2111
2112 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
2113 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
2114 so->stride[i],
2115 ctx->streamout.offsets[i],
2116 out_count,
2117 ctx->streamout.targets[i]);
2118 }
2119
2120 panfrost_emit_varyings(batch,
2121 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
2122 gen_stride, vertex_count);
2123
2124 /* fp32 vec4 gl_Position */
2125 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
2126 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
2127 sizeof(float) * 4, vertex_count);
2128
2129 if (present & (1 << PAN_VARY_PSIZ)) {
2130 primitive_size->pointer = panfrost_emit_varyings(batch,
2131 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
2132 2, vertex_count);
2133 }
2134
2135 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_VARYING_POINT_COORD);
2136 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_VARYING_FRONT_FACING);
2137 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_VARYING_FRAG_COORD);
2138
2139 vertex_postfix->varyings = T.gpu;
2140 tiler_postfix->varyings = T.gpu;
2141
2142 vertex_postfix->varying_meta = trans.gpu;
2143 tiler_postfix->varying_meta = trans.gpu + vs_size;
2144 }
2145
2146 void
2147 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
2148 struct mali_vertex_tiler_prefix *vertex_prefix,
2149 struct mali_vertex_tiler_postfix *vertex_postfix,
2150 struct mali_vertex_tiler_prefix *tiler_prefix,
2151 struct mali_vertex_tiler_postfix *tiler_postfix,
2152 union midgard_primitive_size *primitive_size)
2153 {
2154 struct panfrost_context *ctx = batch->ctx;
2155 struct panfrost_device *device = pan_device(ctx->base.screen);
2156 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
2157 struct bifrost_payload_vertex bifrost_vertex = {0,};
2158 struct bifrost_payload_tiler bifrost_tiler = {0,};
2159 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2160 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2161 void *vp, *tp;
2162 size_t vp_size, tp_size;
2163
2164 if (device->quirks & IS_BIFROST) {
2165 bifrost_vertex.prefix = *vertex_prefix;
2166 bifrost_vertex.postfix = *vertex_postfix;
2167 vp = &bifrost_vertex;
2168 vp_size = sizeof(bifrost_vertex);
2169
2170 bifrost_tiler.prefix = *tiler_prefix;
2171 bifrost_tiler.tiler.primitive_size = *primitive_size;
2172 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2173 bifrost_tiler.postfix = *tiler_postfix;
2174 tp = &bifrost_tiler;
2175 tp_size = sizeof(bifrost_tiler);
2176 } else {
2177 midgard_vertex.prefix = *vertex_prefix;
2178 midgard_vertex.postfix = *vertex_postfix;
2179 vp = &midgard_vertex;
2180 vp_size = sizeof(midgard_vertex);
2181
2182 midgard_tiler.prefix = *tiler_prefix;
2183 midgard_tiler.postfix = *tiler_postfix;
2184 midgard_tiler.primitive_size = *primitive_size;
2185 tp = &midgard_tiler;
2186 tp_size = sizeof(midgard_tiler);
2187 }
2188
2189 if (wallpapering) {
2190 /* Inject in reverse order, with "predicted" job indices.
2191 * THIS IS A HACK XXX */
2192 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2193 batch->job_index + 2, tp, tp_size, true);
2194 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2195 vp, vp_size, true);
2196 return;
2197 }
2198
2199         /* If rasterizer discard is enabled, only submit the vertex job */
2200
2201 bool rasterizer_discard = ctx->rasterizer &&
2202 ctx->rasterizer->base.rasterizer_discard;
2203
2204 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2205 vp, vp_size, false);
2206
2207 if (rasterizer_discard)
2208 return;
2209
2210 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2211 false);
2212 }
2213
2214 /* TODO: stop hardcoding this */
2215 mali_ptr
2216 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2217 {
2218 uint16_t locations[] = {
2219 128, 128,
2220 0, 256,
2221 0, 256,
2222 0, 256,
2223 0, 256,
2224 0, 256,
2225 0, 256,
2226 0, 256,
2227 0, 256,
2228 0, 256,
2229 0, 256,
2230 0, 256,
2231 0, 256,
2232 0, 256,
2233 0, 256,
2234 0, 256,
2235 0, 256,
2236 0, 256,
2237 0, 256,
2238 0, 256,
2239 0, 256,
2240 0, 256,
2241 0, 256,
2242 0, 256,
2243 0, 256,
2244 0, 256,
2245 0, 256,
2246 0, 256,
2247 0, 256,
2248 0, 256,
2249 0, 256,
2250 0, 256,
2251 128, 128,
2252 0, 0,
2253 0, 0,
2254 0, 0,
2255 0, 0,
2256 0, 0,
2257 0, 0,
2258 0, 0,
2259 0, 0,
2260 0, 0,
2261 0, 0,
2262 0, 0,
2263 0, 0,
2264 0, 0,
2265 0, 0,
2266 0, 0,
2267 };
2268
2269 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2270 }