panfrost: Set MALI_BIFROST_EARLY_Z as necessary
[mesa.git] src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), though it could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
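/* For illustration of the indexed path above: a draw whose indices span
 * [5, 7] with index_bias = 100 gives *vertex_count = 7 - 5 + 1 = 3,
 * offset_start = 5 + 100 = 105 and offset_bias_correction = -5, i.e.
 * attribute fetches end up relative to the smallest referenced index
 * rather than to index 0. (Worked example of the assignments above; the
 * "relative to min" reading is inferred from the math, not from hardware
 * documentation.) */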
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
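/* For illustration: the (instance_shift, instance_odd) pair written above
 * encodes the padded count as an odd factor times a power of two,
 * padded == (2 * odd + 1) << shift. E.g. a padded count of 12 (0b1100)
 * gives shift = ctz(12) = 2 and odd = 12 >> 3 = 1, and indeed
 * (2 * 1 + 1) << 2 == 12. A minimal sketch of the inverse, for reference
 * only (hypothetical helper, not part of the driver): */

#if 0
static inline unsigned
pan_decode_padded_count(unsigned shift, unsigned odd)
{
        return (2 * odd + 1) << shift;
}
#endif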
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x950020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349
350 /* TODO: This is not conformant on ES3 */
351 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
352
353 meta->midgard1.flags_lo = 0x220;
354 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
355 }
356 }
357
358 static unsigned
359 panfrost_translate_compare_func(enum pipe_compare_func in)
360 {
361 switch (in) {
362 case PIPE_FUNC_NEVER:
363 return MALI_FUNC_NEVER;
364
365 case PIPE_FUNC_LESS:
366 return MALI_FUNC_LESS;
367
368 case PIPE_FUNC_EQUAL:
369 return MALI_FUNC_EQUAL;
370
371 case PIPE_FUNC_LEQUAL:
372 return MALI_FUNC_LEQUAL;
373
374 case PIPE_FUNC_GREATER:
375 return MALI_FUNC_GREATER;
376
377 case PIPE_FUNC_NOTEQUAL:
378 return MALI_FUNC_NOTEQUAL;
379
380 case PIPE_FUNC_GEQUAL:
381 return MALI_FUNC_GEQUAL;
382
383 case PIPE_FUNC_ALWAYS:
384 return MALI_FUNC_ALWAYS;
385
386 default:
387 unreachable("Invalid func");
388 }
389 }
390
391 static unsigned
392 panfrost_translate_stencil_op(enum pipe_stencil_op in)
393 {
394 switch (in) {
395 case PIPE_STENCIL_OP_KEEP:
396 return MALI_STENCIL_KEEP;
397
398 case PIPE_STENCIL_OP_ZERO:
399 return MALI_STENCIL_ZERO;
400
401 case PIPE_STENCIL_OP_REPLACE:
402 return MALI_STENCIL_REPLACE;
403
404 case PIPE_STENCIL_OP_INCR:
405 return MALI_STENCIL_INCR;
406
407 case PIPE_STENCIL_OP_DECR:
408 return MALI_STENCIL_DECR;
409
410 case PIPE_STENCIL_OP_INCR_WRAP:
411 return MALI_STENCIL_INCR_WRAP;
412
413 case PIPE_STENCIL_OP_DECR_WRAP:
414 return MALI_STENCIL_DECR_WRAP;
415
416 case PIPE_STENCIL_OP_INVERT:
417 return MALI_STENCIL_INVERT;
418
419 default:
420 unreachable("Invalid stencil op");
421 }
422 }
423
424 static unsigned
425 translate_tex_wrap(enum pipe_tex_wrap w)
426 {
427 switch (w) {
428 case PIPE_TEX_WRAP_REPEAT:
429 return MALI_WRAP_REPEAT;
430
431 case PIPE_TEX_WRAP_CLAMP:
432 return MALI_WRAP_CLAMP;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
435 return MALI_WRAP_CLAMP_TO_EDGE;
436
437 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
438 return MALI_WRAP_CLAMP_TO_BORDER;
439
440 case PIPE_TEX_WRAP_MIRROR_REPEAT:
441 return MALI_WRAP_MIRRORED_REPEAT;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP:
444 return MALI_WRAP_MIRRORED_CLAMP;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
448
449 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
450 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
451
452 default:
453 unreachable("Invalid wrap");
454 }
455 }
456
457 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
458 struct mali_sampler_descriptor *hw)
459 {
460 unsigned func = panfrost_translate_compare_func(cso->compare_func);
461 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
462 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
463 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
464 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
465 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
466 unsigned mip_filter = mip_linear ?
467 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
468 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
469
470 *hw = (struct mali_sampler_descriptor) {
471 .filter_mode = min_filter | mag_filter | mip_filter |
472 normalized,
473 .wrap_s = translate_tex_wrap(cso->wrap_s),
474 .wrap_t = translate_tex_wrap(cso->wrap_t),
475 .wrap_r = translate_tex_wrap(cso->wrap_r),
476 .compare_func = panfrost_flip_compare_func(func),
477 .border_color = {
478 cso->border_color.f[0],
479 cso->border_color.f[1],
480 cso->border_color.f[2],
481 cso->border_color.f[3]
482 },
483 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
484 .max_lod = FIXED_16(cso->max_lod, false),
485 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
486 .seamless_cube_map = cso->seamless_cube_map,
487 };
488
489 /* If necessary, we disable mipmapping in the sampler descriptor by
490 * clamping the LOD as tight as possible (from 0 to epsilon,
491 * essentially -- remember these are fixed point numbers, so
492 * epsilon=1/256) */
493
494 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
495 hw->max_lod = hw->min_lod + 1;
496 }
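/* For illustration: the epsilon above implies FIXED_16() keeps 8 fractional
 * bits, so one LSB is 1/256 of a LOD level. With mipmapping off and
 * min_lod = 0.0 the descriptor ends up with min_lod = 0 and max_lod = 1,
 * clamping the LOD to [0, 1/256] -- effectively "level 0 only". (Worked
 * example under that assumption about FIXED_16, not extra hardware
 * documentation.) */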
497
498 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
499 struct bifrost_sampler_descriptor *hw)
500 {
501 *hw = (struct bifrost_sampler_descriptor) {
502 .unk1 = 0x1,
503 .wrap_s = translate_tex_wrap(cso->wrap_s),
504 .wrap_t = translate_tex_wrap(cso->wrap_t),
505 .wrap_r = translate_tex_wrap(cso->wrap_r),
506 .unk8 = 0x8,
507 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
508 .norm_coords = cso->normalized_coords,
509 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
510 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
511 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
512 .max_lod = FIXED_16(cso->max_lod, false),
513 };
514
515 /* If necessary, we disable mipmapping in the sampler descriptor by
516 * clamping the LOD as tight as possible (from 0 to epsilon,
517 * essentially -- remember these are fixed point numbers, so
518 * epsilon=1/256) */
519
520 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
521 hw->max_lod = hw->min_lod + 1;
522 }
523
524 static void
525 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
526 struct mali_stencil_test *out)
527 {
528 out->ref = 0; /* Gallium gets it from elsewhere */
529
530 out->mask = in->valuemask;
531 out->func = panfrost_translate_compare_func(in->func);
532 out->sfail = panfrost_translate_stencil_op(in->fail_op);
533 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
534 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
535 }
536
537 static void
538 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
539 struct mali_shader_meta *fragmeta)
540 {
541 if (!ctx->rasterizer) {
542 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
543 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
544 fragmeta->depth_units = 0.0f;
545 fragmeta->depth_factor = 0.0f;
546 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
547 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
548 return;
549 }
550
551 bool msaa = ctx->rasterizer->base.multisample;
552
553 /* TODO: Sample size */
554 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
555 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
556 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
557 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
558
559 /* XXX: Which bit is which? Does this maybe allow offsetting non-tris? */
560
561 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
562 ctx->rasterizer->base.offset_tri);
563 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
564 ctx->rasterizer->base.offset_tri);
565 }
566
567 static void
568 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
569 struct mali_shader_meta *fragmeta)
570 {
571 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
572 int zfunc = PIPE_FUNC_ALWAYS;
573
574 if (!zsa) {
575 struct pipe_stencil_state default_stencil = {
576 .enabled = 0,
577 .func = PIPE_FUNC_ALWAYS,
578 .fail_op = MALI_STENCIL_KEEP,
579 .zfail_op = MALI_STENCIL_KEEP,
580 .zpass_op = MALI_STENCIL_KEEP,
581 .writemask = 0xFF,
582 .valuemask = 0xFF
583 };
584
585 panfrost_make_stencil_state(&default_stencil,
586 &fragmeta->stencil_front);
587 fragmeta->stencil_mask_front = default_stencil.writemask;
588 fragmeta->stencil_back = fragmeta->stencil_front;
589 fragmeta->stencil_mask_back = default_stencil.writemask;
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
591 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
592 } else {
593 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
594 zsa->stencil[0].enabled);
595 panfrost_make_stencil_state(&zsa->stencil[0],
596 &fragmeta->stencil_front);
597 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
598 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
599
600 /* If back-stencil is not enabled, use the front values */
601
602 if (zsa->stencil[1].enabled) {
603 panfrost_make_stencil_state(&zsa->stencil[1],
604 &fragmeta->stencil_back);
605 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
606 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
607 } else {
608 fragmeta->stencil_back = fragmeta->stencil_front;
609 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
610 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
611 }
612
613 if (zsa->depth.enabled)
614 zfunc = zsa->depth.func;
615
616 /* Depth state (TODO: Refactor) */
617
618 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
619 zsa->depth.writemask);
620 }
621
622 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
623 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
624 }
625
626 static bool
627 panfrost_fs_required(
628 struct panfrost_shader_state *fs,
629 struct panfrost_blend_final *blend,
630 unsigned rt_count)
631 {
632 /* If we generally have side effects */
633 if (fs->fs_sidefx)
634 return true;
635
636 /* If colour is written we need to execute */
637 for (unsigned i = 0; i < rt_count; ++i) {
638 if (!blend[i].no_colour)
639 return true;
640 }
641
642 /* If depth is written and not implied we need to execute.
643 * TODO: Predicate on Z/S writes being enabled */
644 return (fs->writes_depth || fs->writes_stencil);
645 }
646
647 static void
648 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
649 struct mali_shader_meta *fragmeta,
650 void *rts)
651 {
652 const struct panfrost_device *dev = pan_device(ctx->base.screen);
653 struct panfrost_shader_state *fs;
654 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
655
656 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
657 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
658 !ctx->blend->base.dither);
659
660 /* Get blending setup */
661 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
662
663 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
664 unsigned shader_offset = 0;
665 struct panfrost_bo *shader_bo = NULL;
666
667 for (unsigned c = 0; c < rt_count; ++c)
668 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
669 &shader_offset);
670
671 /* Disable shader execution if we can */
672 if (dev->quirks & MIDGARD_SHADERLESS
673 && !panfrost_fs_required(fs, blend, rt_count)) {
674 fragmeta->shader = 0;
675 fragmeta->attribute_count = 0;
676 fragmeta->varying_count = 0;
677 fragmeta->texture_count = 0;
678 fragmeta->sampler_count = 0;
679
680 /* This feature is not known to work on Bifrost */
681 fragmeta->midgard1.work_count = 1;
682 fragmeta->midgard1.uniform_count = 0;
683 fragmeta->midgard1.uniform_buffer_count = 0;
684 }
685
686 /* If there is a blend shader, work registers are shared. We impose 8
687 * work registers as a limit for blend shaders. Should be lower XXX */
688
689 if (!(dev->quirks & IS_BIFROST)) {
690 for (unsigned c = 0; c < rt_count; ++c) {
691 if (blend[c].is_shader) {
692 fragmeta->midgard1.work_count =
693 MAX2(fragmeta->midgard1.work_count, 8);
694 }
695 }
696 }
697
698 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
699 * copied to the blend_meta appended (by convention), but this is the
700 * field actually read by the hardware. (Or maybe both are read...?).
701 * Specify the last RTi with a blend shader. */
702
703 fragmeta->blend.shader = 0;
704
705 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
706 if (!blend[rt].is_shader)
707 continue;
708
709 fragmeta->blend.shader = blend[rt].shader.gpu |
710 blend[rt].shader.first_tag;
711 break;
712 }
713
714 if (dev->quirks & MIDGARD_SFBD) {
715 /* On platforms with only a single render target (SFBD), the blend
716 * information is inside the shader meta itself. We additionally
717 * need to signal CAN_DISCARD for nontrivial blend modes (so
718 * we're able to read back the destination buffer) */
719
720 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
721 blend[0].is_shader);
722
723 if (!blend[0].is_shader) {
724 fragmeta->blend.equation = *blend[0].equation.equation;
725 fragmeta->blend.constant = blend[0].equation.constant;
726 }
727
728 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
729 !blend[0].no_blending || fs->can_discard);
730 return;
731 }
732
733 if (dev->quirks & IS_BIFROST) {
734 bool no_blend = true;
735
736 for (unsigned i = 0; i < rt_count; ++i)
737 no_blend &= (blend[i].no_blending | blend[i].no_colour);
738
739 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
740 !fs->can_discard && !fs->writes_depth && no_blend);
741 }
742
743 /* Additional blend descriptor tacked on for jobs using MFBD */
744
745 for (unsigned i = 0; i < rt_count; ++i) {
746 unsigned flags = 0;
747
748 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
749 flags = 0x200;
750
751 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
752 (ctx->pipe_framebuffer.cbufs[i]) &&
753 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
754
755 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
756 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
757 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
758 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
759 }
760
761 if (dev->quirks & IS_BIFROST) {
762 struct bifrost_blend_rt *brts = rts;
763
764 brts[i].flags = flags;
765
766 if (blend[i].is_shader) {
767 /* The blend shader's address needs to be at
768 * the same top 32 bit as the fragment shader.
769 * TODO: Ensure that's always the case.
770 */
771 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
772 (fs->bo->gpu & (0xffffffffull << 32)));
773 brts[i].shader = blend[i].shader.gpu;
774 brts[i].unk2 = 0x0;
775 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
776 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
777 const struct util_format_description *format_desc;
778 format_desc = util_format_description(format);
779
780 brts[i].equation = *blend[i].equation.equation;
781
782 /* TODO: this is a bit more complicated */
783 brts[i].constant = blend[i].equation.constant;
784
785 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
786
787 /* 0x19 disables blending and forces REPLACE
788 * mode (equivalent to rgb_mode = alpha_mode =
789 * x122, colour mask = 0xF). 0x1a allows
790 * blending. */
791 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
792
793 brts[i].shader_type = fs->blend_types[i];
794 } else {
795 /* Dummy attachment for depth-only */
796 brts[i].unk2 = 0x3;
797 brts[i].shader_type = fs->blend_types[i];
798 }
799 } else {
800 struct midgard_blend_rt *mrts = rts;
801 mrts[i].flags = flags;
802
803 if (blend[i].is_shader) {
804 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
805 } else {
806 mrts[i].blend.equation = *blend[i].equation.equation;
807 mrts[i].blend.constant = blend[i].equation.constant;
808 }
809 }
810 }
811 }
812
813 static void
814 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
815 struct mali_shader_meta *fragmeta,
816 void *rts)
817 {
818 const struct panfrost_device *dev = pan_device(ctx->base.screen);
819 struct panfrost_shader_state *fs;
820
821 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
822
823 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
824 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
825 fragmeta->unknown2_4 = 0x4e0;
826
827 /* unknown2_4 has the 0x10 bit set on T6XX and T720. We don't know why this
828 * is required (independent of 32-bit/64-bit descriptors), or why it's
829 * not used on later GPU revisions. Otherwise, all shader jobs fault on
830 * these earlier chips (perhaps this is a chicken bit of some kind).
831 * More investigation is needed. */
832
833 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
834
835 if (dev->quirks & IS_BIFROST) {
836 /* TODO */
837 } else {
838 /* Depending on whether it's legal in the given shader, we try to
839 * enable early-z testing (or forward-pixel kill?) */
840
841 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
842 !fs->can_discard && !fs->writes_depth);
843
844 /* Add the writes Z/S flags if needed. */
845 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
846 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
847
848 /* Any time texturing is used, derivatives are implicitly calculated,
849 * so we need to enable helper invocations */
850
851 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
852 fs->helper_invocations);
853
854 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
855
856 bool depth_enabled = fs->writes_depth ||
857 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
858
859 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
860 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
861 }
862
863 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
864 panfrost_frag_meta_zsa_update(ctx, fragmeta);
865 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
866 }
867
868 void
869 panfrost_emit_shader_meta(struct panfrost_batch *batch,
870 enum pipe_shader_type st,
871 struct mali_vertex_tiler_postfix *postfix)
872 {
873 struct panfrost_context *ctx = batch->ctx;
874 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
875
876 if (!ss) {
877 postfix->shader = 0;
878 return;
879 }
880
881 struct mali_shader_meta meta;
882
883 panfrost_shader_meta_init(ctx, st, &meta);
884
885 /* Add the shader BO to the batch. */
886 panfrost_batch_add_bo(batch, ss->bo,
887 PAN_BO_ACCESS_PRIVATE |
888 PAN_BO_ACCESS_READ |
889 panfrost_bo_access_for_stage(st));
890
891 mali_ptr shader_ptr;
892
893 if (st == PIPE_SHADER_FRAGMENT) {
894 struct panfrost_device *dev = pan_device(ctx->base.screen);
895 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
896 size_t desc_size = sizeof(meta);
897 void *rts = NULL;
898 struct panfrost_transfer xfer;
899 unsigned rt_size;
900
901 if (dev->quirks & MIDGARD_SFBD)
902 rt_size = 0;
903 else if (dev->quirks & IS_BIFROST)
904 rt_size = sizeof(struct bifrost_blend_rt);
905 else
906 rt_size = sizeof(struct midgard_blend_rt);
907
908 desc_size += rt_size * rt_count;
909
910 if (rt_size)
911 rts = rzalloc_size(ctx, rt_size * rt_count);
912
913 panfrost_frag_shader_meta_init(ctx, &meta, rts);
914
915 xfer = panfrost_allocate_transient(batch, desc_size);
916
917 memcpy(xfer.cpu, &meta, sizeof(meta));
918 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
919
920 if (rt_size)
921 ralloc_free(rts);
922
923 shader_ptr = xfer.gpu;
924 } else {
925 shader_ptr = panfrost_upload_transient(batch, &meta,
926 sizeof(meta));
927 }
928
929 postfix->shader = shader_ptr;
930 }
931
932 static void
933 panfrost_mali_viewport_init(struct panfrost_context *ctx,
934 struct mali_viewport *mvp)
935 {
936 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
937
938 /* Clip bounds are encoded as floats. The viewport itself is encoded as
939 * (somewhat) asymmetric ints. */
940
941 const struct pipe_scissor_state *ss = &ctx->scissor;
942
943 memset(mvp, 0, sizeof(*mvp));
944
945 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
946 * each direction. Clipping to the viewport in theory should work, but
947 * in practice causes issues when we're not explicitly trying to
948 * scissor */
949
950 *mvp = (struct mali_viewport) {
951 .clip_minx = -INFINITY,
952 .clip_miny = -INFINITY,
953 .clip_maxx = INFINITY,
954 .clip_maxy = INFINITY,
955 };
956
957 /* Always scissor to the viewport by default. */
958 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
959 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
960
961 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
962 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
963
964 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
965 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
966
967 /* Apply the scissor test */
968
969 unsigned minx, miny, maxx, maxy;
970
971 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
972 minx = MAX2(ss->minx, vp_minx);
973 miny = MAX2(ss->miny, vp_miny);
974 maxx = MIN2(ss->maxx, vp_maxx);
975 maxy = MIN2(ss->maxy, vp_maxy);
976 } else {
977 minx = vp_minx;
978 miny = vp_miny;
979 maxx = vp_maxx;
980 maxy = vp_maxy;
981 }
982
983 /* Hardware needs the min/max to be strictly ordered, so flip if we
984 * need to. The viewport transformation in the vertex shader will
985 * handle the negatives if we don't */
986
987 if (miny > maxy) {
988 unsigned temp = miny;
989 miny = maxy;
990 maxy = temp;
991 }
992
993 if (minx > maxx) {
994 unsigned temp = minx;
995 minx = maxx;
996 maxx = temp;
997 }
998
999 if (minz > maxz) {
1000 float temp = minz;
1001 minz = maxz;
1002 maxz = temp;
1003 }
1004
1005 /* Clamp to the framebuffer size as a last check */
1006
1007 minx = MIN2(ctx->pipe_framebuffer.width, minx);
1008 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
1009
1010 miny = MIN2(ctx->pipe_framebuffer.height, miny);
1011 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
1012
1013 /* Upload */
1014
1015 mvp->viewport0[0] = minx;
1016 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1017
1018 mvp->viewport0[1] = miny;
1019 mvp->viewport1[1] = MALI_POSITIVE(maxy);
1020
1021 mvp->clip_minz = minz;
1022 mvp->clip_maxz = maxz;
1023 }
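/* For illustration: viewport0 holds the inclusive minimum and viewport1
 * what appears to be the inclusive maximum, i.e. MALI_POSITIVE() storing
 * (max - 1) -- note how panfrost_emit_viewport() below adds 1 back when
 * unioning the batch scissor. A full 1920x1080 viewport with scissoring
 * disabled would therefore give viewport0 = { 0, 0 } and
 * viewport1 = { 1919, 1079 }. (Worked example under that reading.) */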
1024
1025 void
1026 panfrost_emit_viewport(struct panfrost_batch *batch,
1027 struct mali_vertex_tiler_postfix *tiler_postfix)
1028 {
1029 struct panfrost_context *ctx = batch->ctx;
1030 struct mali_viewport mvp;
1031
1032 panfrost_mali_viewport_init(batch->ctx, &mvp);
1033
1034 /* Update the job, unless we're doing wallpapering (whose lack of
1035 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1036 * just... be faster :) */
1037
1038 if (!ctx->wallpaper_batch)
1039 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1040 mvp.viewport0[1],
1041 mvp.viewport1[0] + 1,
1042 mvp.viewport1[1] + 1);
1043
1044 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1045 sizeof(mvp));
1046 }
1047
1048 static mali_ptr
1049 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1050 enum pipe_shader_type st,
1051 struct panfrost_constant_buffer *buf,
1052 unsigned index)
1053 {
1054 struct pipe_constant_buffer *cb = &buf->cb[index];
1055 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1056
1057 if (rsrc) {
1058 panfrost_batch_add_bo(batch, rsrc->bo,
1059 PAN_BO_ACCESS_SHARED |
1060 PAN_BO_ACCESS_READ |
1061 panfrost_bo_access_for_stage(st));
1062
1063 /* Alignment guaranteed by
1064 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1065 return rsrc->bo->gpu + cb->buffer_offset;
1066 } else if (cb->user_buffer) {
1067 return panfrost_upload_transient(batch,
1068 cb->user_buffer +
1069 cb->buffer_offset,
1070 cb->buffer_size);
1071 } else {
1072 unreachable("No constant buffer");
1073 }
1074 }
1075
1076 struct sysval_uniform {
1077 union {
1078 float f[4];
1079 int32_t i[4];
1080 uint32_t u[4];
1081 uint64_t du[2];
1082 };
1083 };
1084
1085 static void
1086 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1087 struct sysval_uniform *uniform)
1088 {
1089 struct panfrost_context *ctx = batch->ctx;
1090 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1091
1092 uniform->f[0] = vp->scale[0];
1093 uniform->f[1] = vp->scale[1];
1094 uniform->f[2] = vp->scale[2];
1095 }
1096
1097 static void
1098 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1099 struct sysval_uniform *uniform)
1100 {
1101 struct panfrost_context *ctx = batch->ctx;
1102 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1103
1104 uniform->f[0] = vp->translate[0];
1105 uniform->f[1] = vp->translate[1];
1106 uniform->f[2] = vp->translate[2];
1107 }
1108
1109 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1110 enum pipe_shader_type st,
1111 unsigned int sysvalid,
1112 struct sysval_uniform *uniform)
1113 {
1114 struct panfrost_context *ctx = batch->ctx;
1115 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1116 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1117 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1118 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1119
1120 assert(dim);
1121 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1122
1123 if (dim > 1)
1124 uniform->i[1] = u_minify(tex->texture->height0,
1125 tex->u.tex.first_level);
1126
1127 if (dim > 2)
1128 uniform->i[2] = u_minify(tex->texture->depth0,
1129 tex->u.tex.first_level);
1130
1131 if (is_array)
1132 uniform->i[dim] = tex->texture->array_size;
1133 }
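/* For illustration: a 256x128 2D array texture with 16 layers, viewed with
 * first_level = 2, produces uniform->i = { 64, 32, 16 } -- width and height
 * minified by the base level, with the array size stored one component past
 * the last spatial dimension. (Worked example of the code above.) */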
1134
1135 static void
1136 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1137 enum pipe_shader_type st,
1138 unsigned ssbo_id,
1139 struct sysval_uniform *uniform)
1140 {
1141 struct panfrost_context *ctx = batch->ctx;
1142
1143 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1144 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1145
1146 /* Compute address */
1147 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1148
1149 panfrost_batch_add_bo(batch, bo,
1150 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1151 panfrost_bo_access_for_stage(st));
1152
1153 /* Upload address and size as sysval */
1154 uniform->du[0] = bo->gpu + sb.buffer_offset;
1155 uniform->u[2] = sb.buffer_size;
1156 }
1157
1158 static void
1159 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1160 enum pipe_shader_type st,
1161 unsigned samp_idx,
1162 struct sysval_uniform *uniform)
1163 {
1164 struct panfrost_context *ctx = batch->ctx;
1165 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1166
1167 uniform->f[0] = sampl->min_lod;
1168 uniform->f[1] = sampl->max_lod;
1169 uniform->f[2] = sampl->lod_bias;
1170
1171 /* Even without any errata, Midgard represents "no mipmapping" as
1172 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1173 * panfrost_create_sampler_state which also explains our choice of
1174 * epsilon value (again to keep behaviour consistent) */
1175
1176 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1177 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1178 }
1179
1180 static void
1181 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1182 struct sysval_uniform *uniform)
1183 {
1184 struct panfrost_context *ctx = batch->ctx;
1185
1186 uniform->u[0] = ctx->compute_grid->grid[0];
1187 uniform->u[1] = ctx->compute_grid->grid[1];
1188 uniform->u[2] = ctx->compute_grid->grid[2];
1189 }
1190
1191 static void
1192 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1193 struct panfrost_shader_state *ss,
1194 enum pipe_shader_type st)
1195 {
1196 struct sysval_uniform *uniforms = (void *)buf;
1197
1198 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1199 int sysval = ss->sysval[i];
1200
1201 switch (PAN_SYSVAL_TYPE(sysval)) {
1202 case PAN_SYSVAL_VIEWPORT_SCALE:
1203 panfrost_upload_viewport_scale_sysval(batch,
1204 &uniforms[i]);
1205 break;
1206 case PAN_SYSVAL_VIEWPORT_OFFSET:
1207 panfrost_upload_viewport_offset_sysval(batch,
1208 &uniforms[i]);
1209 break;
1210 case PAN_SYSVAL_TEXTURE_SIZE:
1211 panfrost_upload_txs_sysval(batch, st,
1212 PAN_SYSVAL_ID(sysval),
1213 &uniforms[i]);
1214 break;
1215 case PAN_SYSVAL_SSBO:
1216 panfrost_upload_ssbo_sysval(batch, st,
1217 PAN_SYSVAL_ID(sysval),
1218 &uniforms[i]);
1219 break;
1220 case PAN_SYSVAL_NUM_WORK_GROUPS:
1221 panfrost_upload_num_work_groups_sysval(batch,
1222 &uniforms[i]);
1223 break;
1224 case PAN_SYSVAL_SAMPLER:
1225 panfrost_upload_sampler_sysval(batch, st,
1226 PAN_SYSVAL_ID(sysval),
1227 &uniforms[i]);
1228 break;
1229 default:
1230 assert(0);
1231 }
1232 }
1233 }
1234
1235 static const void *
1236 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1237 unsigned index)
1238 {
1239 struct pipe_constant_buffer *cb = &buf->cb[index];
1240 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1241
1242 if (rsrc)
1243 return rsrc->bo->cpu;
1244 else if (cb->user_buffer)
1245 return cb->user_buffer;
1246 else
1247 unreachable("No constant buffer");
1248 }
1249
1250 void
1251 panfrost_emit_const_buf(struct panfrost_batch *batch,
1252 enum pipe_shader_type stage,
1253 struct mali_vertex_tiler_postfix *postfix)
1254 {
1255 struct panfrost_context *ctx = batch->ctx;
1256 struct panfrost_shader_variants *all = ctx->shader[stage];
1257
1258 if (!all)
1259 return;
1260
1261 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1262
1263 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1264
1265 /* Uniforms are implicitly UBO #0 */
1266 bool has_uniforms = buf->enabled_mask & (1 << 0);
1267
1268 /* Allocate room for the sysval and the uniforms */
1269 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1270 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1271 size_t size = sys_size + uniform_size;
1272 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1273 size);
1274
1275 /* Upload sysvals requested by the shader */
1276 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1277
1278 /* Upload uniforms */
1279 if (has_uniforms && uniform_size) {
1280 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1281 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1282 }
1283
1284 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1285 * uploaded */
1286
1287 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1288 assert(ubo_count >= 1);
1289
1290 size_t sz = sizeof(uint64_t) * ubo_count;
1291 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1292 int uniform_count = ss->uniform_count;
1293
1294 /* Upload uniforms as a UBO */
1295 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1296
1297 /* The rest are honest-to-goodness UBOs */
1298
1299 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1300 size_t usz = buf->cb[ubo].buffer_size;
1301 bool enabled = buf->enabled_mask & (1 << ubo);
1302 bool empty = usz == 0;
1303
1304 if (!enabled || empty) {
1305 /* Stub out disabled UBOs to catch accesses */
1306 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1307 continue;
1308 }
1309
1310 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1311 buf, ubo);
1312
1313 unsigned bytes_per_field = 16;
1314 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1315 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1316 }
1317
1318 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1319 postfix->uniforms = transfer.gpu;
1320 postfix->uniform_buffers = ubufs;
1321
1322 buf->dirty_mask = 0;
1323 }
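/* For illustration: MALI_MAKE_UBO() takes a size in 16-byte fields, so a
 * 100-byte UBO above is advertised as ALIGN_POT(100, 16) / 16 == 7 fields,
 * while disabled or empty slots get a zero-field entry at the poison
 * address 0xDEAD0000 to catch stray accesses. (Worked example of the
 * arithmetic above, nothing more.) */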
1324
1325 void
1326 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1327 const struct pipe_grid_info *info,
1328 struct midgard_payload_vertex_tiler *vtp)
1329 {
1330 struct panfrost_context *ctx = batch->ctx;
1331 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1332 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1333 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1334 128));
1335 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1336 info->grid[2] * 4;
1337 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1338 shared_size,
1339 1);
1340
1341 struct mali_shared_memory shared = {
1342 .shared_memory = bo->gpu,
1343 .shared_workgroup_count =
1344 util_logbase2_ceil(info->grid[0]) +
1345 util_logbase2_ceil(info->grid[1]) +
1346 util_logbase2_ceil(info->grid[2]),
1347 .shared_unk1 = 0x2,
1348 .shared_shift = util_logbase2(single_size) - 1
1349 };
1350
1351 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1352 sizeof(shared));
1353 }
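/* For illustration, assuming ss->shared_size = 200 and an 8x8x1 grid: the
 * per-workgroup size is rounded up to util_next_power_of_two(200) = 256
 * bytes, the BO request is 256 * 8 * 8 * 1 * 4 = 65536 bytes,
 * shared_workgroup_count = 3 + 3 + 0 = 6 and shared_shift =
 * log2(256) - 1 = 7. (A worked instance of the arithmetic above; why the
 * extra "* 4" and "- 1" are needed is a hardware detail not explained
 * here.) */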
1354
1355 static mali_ptr
1356 panfrost_get_tex_desc(struct panfrost_batch *batch,
1357 enum pipe_shader_type st,
1358 struct panfrost_sampler_view *view)
1359 {
1360 if (!view)
1361 return (mali_ptr) 0;
1362
1363 struct pipe_sampler_view *pview = &view->base;
1364 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1365
1366 /* Add the BO to the job so it's retained until the job is done. */
1367
1368 panfrost_batch_add_bo(batch, rsrc->bo,
1369 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1370 panfrost_bo_access_for_stage(st));
1371
1372 panfrost_batch_add_bo(batch, view->midgard_bo,
1373 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1374 panfrost_bo_access_for_stage(st));
1375
1376 return view->midgard_bo->gpu;
1377 }
1378
1379 void
1380 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1381 enum pipe_shader_type stage,
1382 struct mali_vertex_tiler_postfix *postfix)
1383 {
1384 struct panfrost_context *ctx = batch->ctx;
1385 struct panfrost_device *device = pan_device(ctx->base.screen);
1386
1387 if (!ctx->sampler_view_count[stage])
1388 return;
1389
1390 if (device->quirks & IS_BIFROST) {
1391 struct bifrost_texture_descriptor *descriptors;
1392
1393 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1394 ctx->sampler_view_count[stage]);
1395
1396 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1397 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1398 struct pipe_sampler_view *pview = &view->base;
1399 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1400
1401 /* Add the BOs to the job so they are retained until the job is done. */
1402
1403 panfrost_batch_add_bo(batch, rsrc->bo,
1404 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1405 panfrost_bo_access_for_stage(stage));
1406
1407 panfrost_batch_add_bo(batch, view->bifrost_bo,
1408 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1409 panfrost_bo_access_for_stage(stage));
1410
1411 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1412 }
1413
1414 postfix->textures = panfrost_upload_transient(batch,
1415 descriptors,
1416 sizeof(struct bifrost_texture_descriptor) *
1417 ctx->sampler_view_count[stage]);
1418
1419 free(descriptors);
1420 } else {
1421 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1422
1423 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1424 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1425 ctx->sampler_views[stage][i]);
1426
1427 postfix->textures = panfrost_upload_transient(batch,
1428 trampolines,
1429 sizeof(uint64_t) *
1430 ctx->sampler_view_count[stage]);
1431 }
1432 }
1433
1434 void
1435 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1436 enum pipe_shader_type stage,
1437 struct mali_vertex_tiler_postfix *postfix)
1438 {
1439 struct panfrost_context *ctx = batch->ctx;
1440 struct panfrost_device *device = pan_device(ctx->base.screen);
1441
1442 if (!ctx->sampler_count[stage])
1443 return;
1444
1445 if (device->quirks & IS_BIFROST) {
1446 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1447 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1448 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1449 transfer_size);
1450 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1451
1452 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1453 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1454
1455 postfix->sampler_descriptor = transfer.gpu;
1456 } else {
1457 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1458 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1459 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1460 transfer_size);
1461 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1462
1463 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1464 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1465
1466 postfix->sampler_descriptor = transfer.gpu;
1467 }
1468 }
1469
1470 void
1471 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1472 struct mali_vertex_tiler_postfix *vertex_postfix)
1473 {
1474 struct panfrost_context *ctx = batch->ctx;
1475
1476 if (!ctx->vertex)
1477 return;
1478
1479 struct panfrost_vertex_state *so = ctx->vertex;
1480
1481 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1482 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1483 sizeof(*so->hw) *
1484 PAN_MAX_ATTRIBUTE);
1485 }
1486
1487 void
1488 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1489 struct mali_vertex_tiler_postfix *vertex_postfix)
1490 {
1491 struct panfrost_context *ctx = batch->ctx;
1492 struct panfrost_vertex_state *so = ctx->vertex;
1493
1494 /* Staged mali_attr, and index into them. i =/= k, depending on the
1495 * vertex buffer mask and instancing. Twice as much room is allocated,
1496 * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1497 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1498 unsigned k = 0;
1499
1500 for (unsigned i = 0; i < so->num_elements; ++i) {
1501 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1502 * means duplicating some vertex buffers (who cares? aside from
1503 * maybe some caching implications but I somehow doubt that
1504 * matters) */
1505
1506 struct pipe_vertex_element *elem = &so->pipe[i];
1507 unsigned vbi = elem->vertex_buffer_index;
1508
1509 /* The exception to 1:1 mapping is that we can have multiple
1510 * entries (NPOT divisors), so we fix up anyway */
1511
1512 so->hw[i].index = k;
1513
1514 if (!(ctx->vb_mask & (1 << vbi)))
1515 continue;
1516
1517 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1518 struct panfrost_resource *rsrc;
1519
1520 rsrc = pan_resource(buf->buffer.resource);
1521 if (!rsrc)
1522 continue;
1523
1524 /* Align to 64 bytes by masking off the lower bits. This
1525 * will be adjusted back when we fixup the src_offset in
1526 * mali_attr_meta */
1527
1528 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1529 mali_ptr addr = raw_addr & ~63;
1530 unsigned chopped_addr = raw_addr - addr;
1531
1532 /* Add a dependency of the batch on the vertex buffer */
1533 panfrost_batch_add_bo(batch, rsrc->bo,
1534 PAN_BO_ACCESS_SHARED |
1535 PAN_BO_ACCESS_READ |
1536 PAN_BO_ACCESS_VERTEX_TILER);
1537
1538 /* Set common fields */
1539 attrs[k].elements = addr;
1540 attrs[k].stride = buf->stride;
1541
1542 /* Since we advanced the base pointer, we shrink the buffer
1543 * size */
1544 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1545
1546 /* We need to add the extra size we masked off (for
1547 * correctness) so the data doesn't get clamped away */
1548 attrs[k].size += chopped_addr;
1549
1550 /* For the non-instanced case, make sure we initialize */
1551 attrs[k].shift = attrs[k].extra_flags = 0;
1552
1553 /* Instancing uses a dramatically different code path than
1554 * linear, so dispatch for the actual emission now that the
1555 * common code is finished */
1556
1557 unsigned divisor = elem->instance_divisor;
1558
1559 if (divisor && ctx->instance_count == 1) {
1560 /* Silly corner case where there's a divisor(=1) but
1561 * there's no legitimate instancing. So we want *every*
1562 * attribute to be the same. So set stride to zero so
1563 * we don't go anywhere. */
1564
1565 attrs[k].size = attrs[k].stride + chopped_addr;
1566 attrs[k].stride = 0;
1567 attrs[k++].elements |= MALI_ATTR_LINEAR;
1568 } else if (ctx->instance_count <= 1) {
1569 /* Normal, non-instanced attributes */
1570 attrs[k++].elements |= MALI_ATTR_LINEAR;
1571 } else {
1572 unsigned instance_shift = vertex_postfix->instance_shift;
1573 unsigned instance_odd = vertex_postfix->instance_odd;
1574
1575 k += panfrost_vertex_instanced(ctx->padded_count,
1576 instance_shift,
1577 instance_odd,
1578 divisor, &attrs[k]);
1579 }
1580 }
1581
1582 /* Add special gl_VertexID/gl_InstanceID buffers */
1583
1584 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1585 so->hw[PAN_VERTEX_ID].index = k++;
1586 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1587 so->hw[PAN_INSTANCE_ID].index = k++;
1588
1589 /* Upload whatever we emitted and go */
1590
1591 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1592 k * sizeof(*attrs));
1593 }
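/* For illustration of the 64-byte alignment above: a vertex buffer at
 * rsrc->bo->gpu + buffer_offset = 0x10000024 yields addr = 0x10000000 and
 * chopped_addr = 0x24, so the record points 36 bytes early and its size
 * grows by 36 bytes; per the comment above, those 36 bytes are expected to
 * be added back into src_offset when the attribute metadata is fixed up
 * (presumably in panfrost_vertex_state_upd_attr_offs()). (Worked example,
 * not a statement about the fixup's implementation.) */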
1594
1595 static mali_ptr
1596 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1597 unsigned stride, unsigned count)
1598 {
1599 /* Fill out the descriptor */
1600 slot->stride = stride;
1601 slot->size = stride * count;
1602 slot->shift = slot->extra_flags = 0;
1603
1604 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1605 slot->size);
1606
1607 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1608
1609 return transfer.gpu;
1610 }
1611
1612 static void
1613 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1614 unsigned stride, unsigned offset, unsigned count,
1615 struct pipe_stream_output_target *target)
1616 {
1617 /* Fill out the descriptor */
1618 slot->stride = stride * 4;
1619 slot->shift = slot->extra_flags = 0;
1620
1621 unsigned max_size = target->buffer_size;
1622 unsigned expected_size = slot->stride * count;
1623
1624 slot->size = MIN2(max_size, expected_size);
1625
1626 /* Grab the BO and bind it to the batch */
1627 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1628
1629 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1630 * the perspective of the TILER and FRAGMENT.
1631 */
1632 panfrost_batch_add_bo(batch, bo,
1633 PAN_BO_ACCESS_SHARED |
1634 PAN_BO_ACCESS_RW |
1635 PAN_BO_ACCESS_VERTEX_TILER |
1636 PAN_BO_ACCESS_FRAGMENT);
1637
1638 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1639 slot->elements = addr;
1640 }
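/* For illustration: gallium expresses stream-output strides in dwords,
 * hence the "* 4" above. Capturing a vec4 (stride 4 dwords) for 100
 * vertices gives slot->stride = 16 bytes and expected_size = 1600 bytes,
 * clamped to the target's buffer_size when the buffer is smaller. (Worked
 * example of the arithmetic above.) */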
1641
1642 /* Given a shader and buffer indices, link varying metadata together */
1643
1644 static bool
1645 is_special_varying(gl_varying_slot loc)
1646 {
1647 switch (loc) {
1648 case VARYING_SLOT_POS:
1649 case VARYING_SLOT_PSIZ:
1650 case VARYING_SLOT_PNTC:
1651 case VARYING_SLOT_FACE:
1652 return true;
1653 default:
1654 return false;
1655 }
1656 }
1657
1658 static void
1659 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1660 signed general, signed gl_Position,
1661 signed gl_PointSize, signed gl_PointCoord,
1662 signed gl_FrontFacing)
1663 {
1664 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1665
1666 for (unsigned i = 0; i < ss->varying_count; ++i) {
1667 gl_varying_slot location = ss->varyings_loc[i];
1668 int index = -1;
1669
1670 switch (location) {
1671 case VARYING_SLOT_POS:
1672 index = gl_Position;
1673 break;
1674 case VARYING_SLOT_PSIZ:
1675 index = gl_PointSize;
1676 break;
1677 case VARYING_SLOT_PNTC:
1678 index = gl_PointCoord;
1679 break;
1680 case VARYING_SLOT_FACE:
1681 index = gl_FrontFacing;
1682 break;
1683 default:
1684 index = general;
1685 break;
1686 }
1687
1688 assert(index >= 0);
1689 out[i].index = index;
1690 }
1691 }
1692
1693 static bool
1694 has_point_coord(unsigned mask, gl_varying_slot loc)
1695 {
1696 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1697 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1698 else if (loc == VARYING_SLOT_PNTC)
1699 return (mask & (1 << 8));
1700 else
1701 return false;
1702 }
1703
1704 /* Helpers for manipulating stream out information so we can pack varyings
1705 * accordingly. Compute the src_offset for a given captured varying */
1706
1707 static struct pipe_stream_output *
1708 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1709 {
1710 for (unsigned i = 0; i < info->num_outputs; ++i) {
1711 if (info->output[i].register_index == loc)
1712 return &info->output[i];
1713 }
1714
1715 unreachable("Varying not captured");
1716 }
1717
1718 void
1719 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1720 unsigned vertex_count,
1721 struct mali_vertex_tiler_postfix *vertex_postfix,
1722 struct mali_vertex_tiler_postfix *tiler_postfix,
1723 union midgard_primitive_size *primitive_size)
1724 {
1725 /* Load the shaders */
1726 struct panfrost_context *ctx = batch->ctx;
1727 struct panfrost_shader_state *vs, *fs;
1728 unsigned int num_gen_varyings = 0;
1729 size_t vs_size, fs_size;
1730
1731 /* Allocate the varying descriptor */
1732
1733 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1734 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1735 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1736 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1737
1738 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1739 vs_size +
1740 fs_size);
1741
1742 struct pipe_stream_output_info *so = &vs->stream_output;
1743
1744 /* Check if this varying is linked by us. This is the case for
1745 * general-purpose, non-captured varyings. If it is, link it. If it's
1746 * not, use the provided stream out information to determine the
1747 * offset, since it was already linked for us. */
1748
1749 for (unsigned i = 0; i < vs->varying_count; i++) {
1750 gl_varying_slot loc = vs->varyings_loc[i];
1751
1752 bool special = is_special_varying(loc);
1753 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1754
1755 if (captured) {
1756 struct pipe_stream_output *o = pan_get_so(so, loc);
1757
1758 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1759 vs->varyings[i].src_offset = dst_offset;
1760 } else if (!special) {
1761 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1762 }
1763 }
1764
1765 /* For the captured varyings above, src_offset came from the stream out
1766  * info: that layout is defined by the stream out state, not by us */
1767
1768 /* Link up with fragment varyings */
1769 bool reads_point_coord = fs->reads_point_coord;
1770
1771 for (unsigned i = 0; i < fs->varying_count; i++) {
1772 gl_varying_slot loc = fs->varyings_loc[i];
1773 unsigned src_offset;
1774 signed vs_idx = -1;
1775
1776 /* Link up */
1777 for (unsigned j = 0; j < vs->varying_count; ++j) {
1778 if (vs->varyings_loc[j] == loc) {
1779 vs_idx = j;
1780 break;
1781 }
1782 }
1783
1784 /* Either assign or reuse */
1785 if (vs_idx >= 0)
1786 src_offset = vs->varyings[vs_idx].src_offset;
1787 else
1788 src_offset = 16 * (num_gen_varyings++);
1789
1790 fs->varyings[i].src_offset = src_offset;
1791
1792 if (has_point_coord(fs->point_sprite_mask, loc))
1793 reads_point_coord = true;
1794 }
1795
1796 memcpy(trans.cpu, vs->varyings, vs_size);
1797 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
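/* The linked attribute records are staged now; the buffer indices they
 * reference are assigned by panfrost_emit_varying_meta below and then
 * patched for streamout capture and point sprite replacement. */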
1798
1799 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1800
1801 /* Figure out how many streamout buffers could be bound */
1802 unsigned so_count = ctx->streamout.num_targets;
1803 for (unsigned i = 0; i < vs->varying_count; i++) {
1804 gl_varying_slot loc = vs->varyings_loc[i];
1805
1806 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1807 if (!captured) continue;
1808
1809 struct pipe_stream_output *o = pan_get_so(so, loc);
1810 so_count = MAX2(so_count, o->output_buffer + 1);
1811 }
1812
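/* Varying buffers are allocated with the streamout buffers first (indices
 * [0, so_count)), then the general varying buffer, then one buffer per
 * special varying that is actually used. As a hypothetical example, with
 * one streamout buffer bound, a vertex shader writing gl_PointSize and a
 * fragment shader reading gl_PointCoord (but not gl_FrontFacing or
 * gl_FragCoord), the layout would be:
 *
 *   0: streamout buffer 0
 *   1: general varyings
 *   2: gl_Position
 *   3: gl_PointSize
 *   4: gl_PointCoord
 */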
1813 signed idx = so_count;
1814 signed general = idx++;
1815 signed gl_Position = idx++;
1816 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1817 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1818 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1819 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1820
1821 /* Emit the stream out buffers */
1822
1823 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1824 ctx->vertex_count);
1825
1826 for (unsigned i = 0; i < so_count; ++i) {
1827 if (i < ctx->streamout.num_targets) {
1828 panfrost_emit_streamout(batch, &varyings[i],
1829 so->stride[i],
1830 ctx->streamout.offsets[i],
1831 out_count,
1832 ctx->streamout.targets[i]);
1833 } else {
1834 /* Emit a dummy buffer */
1835 panfrost_emit_varyings(batch, &varyings[i],
1836 so->stride[i] * 4,
1837 out_count);
1838
1839 /* Clear the attribute type */
1840 varyings[i].elements &= ~0xF;
1841 }
1842 }
1843
1844 panfrost_emit_varyings(batch, &varyings[general],
1845 num_gen_varyings * 16,
1846 vertex_count);
1847
1848 mali_ptr varyings_p;
1849
1850 /* fp32 vec4 gl_Position */
1851 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1852 sizeof(float) * 4, vertex_count);
1853 tiler_postfix->position_varying = varyings_p;
1854
1855
1856 if (panfrost_writes_point_size(ctx)) {
1857 varyings_p = panfrost_emit_varyings(batch,
1858 &varyings[gl_PointSize],
1859 2, vertex_count);
1860 primitive_size->pointer = varyings_p;
1861 }
1862
1863 if (reads_point_coord)
1864 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1865
1866 if (fs->reads_face)
1867 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1868
1869 if (fs->reads_frag_coord)
1870 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1871
1872 struct panfrost_device *device = pan_device(ctx->base.screen);
1873 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1874
1875 /* Let's go ahead and link varying meta to the buffer in question,
1876  * now that that information is available. VARYING_SLOT_POS is mapped
1877  * to gl_FragCoord for fragment shaders but to gl_Position for vertex
1878  * shaders. */
1879
1880 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1881 gl_PointSize, gl_PointCoord,
1882 gl_FrontFacing);
1883
1884 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1885 gl_FragCoord, gl_PointSize,
1886 gl_PointCoord, gl_FrontFacing);
1887
1888 /* Replace streamout */
1889
1890 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1891 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1892
1893 for (unsigned i = 0; i < vs->varying_count; i++) {
1894 gl_varying_slot loc = vs->varyings_loc[i];
1895
1896 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1897 if (!captured)
1898 continue;
1899
1900 struct pipe_stream_output *o = pan_get_so(so, loc);
1901 ovs[i].index = o->output_buffer;
1902
1903 assert(o->stream == 0);
1904 ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
1905 | MALI_NR_CHANNELS(o->num_components);
1906
1907 if (device->quirks & HAS_SWIZZLES)
1908 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1909 else
1910 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1911
1912 /* Link to the fragment */
1913 signed fs_idx = -1;
1914
1915 /* Link up */
1916 for (unsigned j = 0; j < fs->varying_count; ++j) {
1917 if (fs->varyings_loc[j] == loc) {
1918 fs_idx = j;
1919 break;
1920 }
1921 }
1922
1923 if (fs_idx >= 0) {
1924 ofs[fs_idx].index = ovs[i].index;
1925 ofs[fs_idx].format = ovs[i].format;
1926 ofs[fs_idx].swizzle = ovs[i].swizzle;
1927 }
1928 }
1929
1930 /* Replace point sprite */
1931 for (unsigned i = 0; i < fs->varying_count; i++) {
1932 /* If we have a point sprite replacement, handle that here. We
1933  * have to translate the location first. TODO: flip Y in the shader
1934  * instead; we already key the shader, this is just a time crunch. */
1935
1936 if (has_point_coord(fs->point_sprite_mask,
1937 fs->varyings_loc[i])) {
1938 ofs[i].index = gl_PointCoord;
1939
1940 /* Swizzle out the z/w to 0/1 */
1941 ofs[i].format = MALI_RG16F;
1942 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1943 }
1944 }
1945
1946 /* Fix up unaligned addresses */
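/* Streamout buffer addresses are not necessarily 64-byte aligned. For each
 * such record, move any misalignment out of the address bits and into the
 * src_offset of every attribute that reads the buffer, growing the recorded
 * size to compensate. */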
1947 for (unsigned i = 0; i < so_count; ++i) {
1948 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1949 continue;
1950
1951 unsigned align = (varyings[i].elements & 63);
1952
1953 /* While we're at it, the SO buffers are linear */
1954
1955 if (!align) {
1956 varyings[i].elements |= MALI_ATTR_LINEAR;
1957 continue;
1958 }
1959
1960 /* We need to adjust alignment */
1961 varyings[i].elements &= ~63;
1962 varyings[i].elements |= MALI_ATTR_LINEAR;
1963 varyings[i].size += align;
1964
1965 for (unsigned v = 0; v < vs->varying_count; ++v) {
1966 if (ovs[v].index != i)
1967 continue;
1968
1969 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1970 }
1971
1972 for (unsigned f = 0; f < fs->varying_count; ++f) {
1973 if (ofs[f].index != i)
1974 continue;
1975
1976 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1977 }
1978 }
1979
1980 varyings_p = panfrost_upload_transient(batch, varyings,
1981 idx * sizeof(*varyings));
1982 vertex_postfix->varyings = varyings_p;
1983 tiler_postfix->varyings = varyings_p;
1984
1985 vertex_postfix->varying_meta = trans.gpu;
1986 tiler_postfix->varying_meta = trans.gpu + vs_size;
1987 }
1988
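/* Wrap the prepared prefix/postfix structures in the Bifrost or Midgard
 * payload layout as appropriate and enqueue the vertex and tiler jobs,
 * skipping the tiler job entirely when rasterizer discard is enabled. */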
1989 void
1990 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1991 struct mali_vertex_tiler_prefix *vertex_prefix,
1992 struct mali_vertex_tiler_postfix *vertex_postfix,
1993 struct mali_vertex_tiler_prefix *tiler_prefix,
1994 struct mali_vertex_tiler_postfix *tiler_postfix,
1995 union midgard_primitive_size *primitive_size)
1996 {
1997 struct panfrost_context *ctx = batch->ctx;
1998 struct panfrost_device *device = pan_device(ctx->base.screen);
1999 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
2000 struct bifrost_payload_vertex bifrost_vertex = {0,};
2001 struct bifrost_payload_tiler bifrost_tiler = {0,};
2002 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2003 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2004 void *vp, *tp;
2005 size_t vp_size, tp_size;
2006
2007 if (device->quirks & IS_BIFROST) {
2008 bifrost_vertex.prefix = *vertex_prefix;
2009 bifrost_vertex.postfix = *vertex_postfix;
2010 vp = &bifrost_vertex;
2011 vp_size = sizeof(bifrost_vertex);
2012
2013 bifrost_tiler.prefix = *tiler_prefix;
2014 bifrost_tiler.tiler.primitive_size = *primitive_size;
2015 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2016 bifrost_tiler.postfix = *tiler_postfix;
2017 tp = &bifrost_tiler;
2018 tp_size = sizeof(bifrost_tiler);
2019 } else {
2020 midgard_vertex.prefix = *vertex_prefix;
2021 midgard_vertex.postfix = *vertex_postfix;
2022 vp = &midgard_vertex;
2023 vp_size = sizeof(midgard_vertex);
2024
2025 midgard_tiler.prefix = *tiler_prefix;
2026 midgard_tiler.postfix = *tiler_postfix;
2027 midgard_tiler.primitive_size = *primitive_size;
2028 tp = &midgard_tiler;
2029 tp_size = sizeof(midgard_tiler);
2030 }
2031
2032 if (wallpapering) {
2033 /* Inject in reverse order, with "predicted" job indices.
2034 * THIS IS A HACK XXX */
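/* The tiler job is injected before the vertex job it depends on, so it
 * presumably references the index that vertex job is expected to receive
 * (batch->job_index + 2) rather than a real, already-assigned index. */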
2035 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2036 batch->job_index + 2, tp, tp_size, true);
2037 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2038 vp, vp_size, true);
2039 return;
2040 }
2041
2042 /* If rasterizer discard is enabled, only submit the vertex job */
2043
2044 bool rasterizer_discard = ctx->rasterizer &&
2045 ctx->rasterizer->base.rasterizer_discard;
2046
2047 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2048 vp, vp_size, false);
2049
2050 if (rasterizer_discard)
2051 return;
2052
2053 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2054 false);
2055 }
2056
2057 /* TODO: stop hardcoding this */
2058 mali_ptr
2059 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2060 {
2061 uint16_t locations[] = {
2062 128, 128,
2063 0, 256,
2064 0, 256,
2065 0, 256,
2066 0, 256,
2067 0, 256,
2068 0, 256,
2069 0, 256,
2070 0, 256,
2071 0, 256,
2072 0, 256,
2073 0, 256,
2074 0, 256,
2075 0, 256,
2076 0, 256,
2077 0, 256,
2078 0, 256,
2079 0, 256,
2080 0, 256,
2081 0, 256,
2082 0, 256,
2083 0, 256,
2084 0, 256,
2085 0, 256,
2086 0, 256,
2087 0, 256,
2088 0, 256,
2089 0, 256,
2090 0, 256,
2091 0, 256,
2092 0, 256,
2093 0, 256,
2094 128, 128,
2095 0, 0,
2096 0, 0,
2097 0, 0,
2098 0, 0,
2099 0, 0,
2100 0, 0,
2101 0, 0,
2102 0, 0,
2103 0, 0,
2104 0, 0,
2105 0, 0,
2106 0, 0,
2107 0, 0,
2108 0, 0,
2109 0, 0,
2110 };
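/* 48 (x, y) pairs, 96 halfwords in total, uploaded verbatim; the layout the
 * hardware expects is hardcoded here for now (see the TODO above). */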
2111
2112 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2113 }