panfrost: Fix Bifrost blending with depth-only FBO
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
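/* Bifrost vertex/tiler jobs point at a shared memory descriptor rather than
 * a framebuffer: size the scratchpad (thread-local stack) from the batch's
 * stack requirements, and leave shared_workgroup_count as ~0, presumably
 * meaning no workgroups for graphics jobs. */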
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), though it could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
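/* Rebase the draw on min_index so the attribute buffers only need to cover
 * the [min_index, max_index] window: offset_start bakes in min_index (plus
 * the index bias), while offset_bias_correction = -min_index compensates on
 * the index fetch side. */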
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
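/* The instancing fields want the padded count decomposed into an odd factor
 * and a power-of-two shift: ctz extracts the shift, and the remaining odd
 * factor n is encoded as instance_odd = (n - 1) / 2. */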
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349 meta->midgard1.flags_hi = 0x8; /* XXX */
350 meta->midgard1.flags_lo = 0x220;
351 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
352 }
353 }
354
355 static unsigned
356 panfrost_translate_compare_func(enum pipe_compare_func in)
357 {
358 switch (in) {
359 case PIPE_FUNC_NEVER:
360 return MALI_FUNC_NEVER;
361
362 case PIPE_FUNC_LESS:
363 return MALI_FUNC_LESS;
364
365 case PIPE_FUNC_EQUAL:
366 return MALI_FUNC_EQUAL;
367
368 case PIPE_FUNC_LEQUAL:
369 return MALI_FUNC_LEQUAL;
370
371 case PIPE_FUNC_GREATER:
372 return MALI_FUNC_GREATER;
373
374 case PIPE_FUNC_NOTEQUAL:
375 return MALI_FUNC_NOTEQUAL;
376
377 case PIPE_FUNC_GEQUAL:
378 return MALI_FUNC_GEQUAL;
379
380 case PIPE_FUNC_ALWAYS:
381 return MALI_FUNC_ALWAYS;
382
383 default:
384 unreachable("Invalid func");
385 }
386 }
387
388 static unsigned
389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
390 {
391 switch (in) {
392 case PIPE_STENCIL_OP_KEEP:
393 return MALI_STENCIL_KEEP;
394
395 case PIPE_STENCIL_OP_ZERO:
396 return MALI_STENCIL_ZERO;
397
398 case PIPE_STENCIL_OP_REPLACE:
399 return MALI_STENCIL_REPLACE;
400
401 case PIPE_STENCIL_OP_INCR:
402 return MALI_STENCIL_INCR;
403
404 case PIPE_STENCIL_OP_DECR:
405 return MALI_STENCIL_DECR;
406
407 case PIPE_STENCIL_OP_INCR_WRAP:
408 return MALI_STENCIL_INCR_WRAP;
409
410 case PIPE_STENCIL_OP_DECR_WRAP:
411 return MALI_STENCIL_DECR_WRAP;
412
413 case PIPE_STENCIL_OP_INVERT:
414 return MALI_STENCIL_INVERT;
415
416 default:
417 unreachable("Invalid stencil op");
418 }
419 }
420
421 static unsigned
422 translate_tex_wrap(enum pipe_tex_wrap w)
423 {
424 switch (w) {
425 case PIPE_TEX_WRAP_REPEAT:
426 return MALI_WRAP_REPEAT;
427
428 case PIPE_TEX_WRAP_CLAMP:
429 return MALI_WRAP_CLAMP;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
432 return MALI_WRAP_CLAMP_TO_EDGE;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435 return MALI_WRAP_CLAMP_TO_BORDER;
436
437 case PIPE_TEX_WRAP_MIRROR_REPEAT:
438 return MALI_WRAP_MIRRORED_REPEAT;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP:
441 return MALI_WRAP_MIRRORED_CLAMP;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
444 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
448
449 default:
450 unreachable("Invalid wrap");
451 }
452 }
453
454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
455 struct mali_sampler_descriptor *hw)
456 {
457 unsigned func = panfrost_translate_compare_func(cso->compare_func);
458 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
459 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
460 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
461 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
462 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
463 unsigned mip_filter = mip_linear ?
464 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
465 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
466
467 *hw = (struct mali_sampler_descriptor) {
468 .filter_mode = min_filter | mag_filter | mip_filter |
469 normalized,
470 .wrap_s = translate_tex_wrap(cso->wrap_s),
471 .wrap_t = translate_tex_wrap(cso->wrap_t),
472 .wrap_r = translate_tex_wrap(cso->wrap_r),
473 .compare_func = panfrost_flip_compare_func(func),
474 .border_color = {
475 cso->border_color.f[0],
476 cso->border_color.f[1],
477 cso->border_color.f[2],
478 cso->border_color.f[3]
479 },
480 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
481 .max_lod = FIXED_16(cso->max_lod, false),
482 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
483 .seamless_cube_map = cso->seamless_cube_map,
484 };
485
486 /* If necessary, we disable mipmapping in the sampler descriptor by
487 * clamping the LOD as tight as possible (from 0 to epsilon,
488 * essentially -- remember these are fixed point numbers, so
489 * epsilon=1/256) */
490
491 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
492 hw->max_lod = hw->min_lod + 1;
493 }
494
495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
496 struct bifrost_sampler_descriptor *hw)
497 {
498 *hw = (struct bifrost_sampler_descriptor) {
499 .unk1 = 0x1,
500 .wrap_s = translate_tex_wrap(cso->wrap_s),
501 .wrap_t = translate_tex_wrap(cso->wrap_t),
502 .wrap_r = translate_tex_wrap(cso->wrap_r),
503 .unk8 = 0x8,
504 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
505 .norm_coords = cso->normalized_coords,
506 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
507 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
508 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
509 .max_lod = FIXED_16(cso->max_lod, false),
510 };
511
512 /* If necessary, we disable mipmapping in the sampler descriptor by
513 * clamping the LOD as tight as possible (from 0 to epsilon,
514 * essentially -- remember these are fixed point numbers, so
515 * epsilon=1/256) */
516
517 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
518 hw->max_lod = hw->min_lod + 1;
519 }
520
521 static void
522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
523 struct mali_stencil_test *out)
524 {
525 out->ref = 0; /* Gallium gets it from elsewhere */
526
527 out->mask = in->valuemask;
528 out->func = panfrost_translate_compare_func(in->func);
529 out->sfail = panfrost_translate_stencil_op(in->fail_op);
530 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
531 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
532 }
533
534 static void
535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
536 struct mali_shader_meta *fragmeta)
537 {
538 if (!ctx->rasterizer) {
539 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
540 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
541 fragmeta->depth_units = 0.0f;
542 fragmeta->depth_factor = 0.0f;
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
544 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
545 return;
546 }
547
548 bool msaa = ctx->rasterizer->base.multisample;
549
550 /* TODO: Sample size */
551 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
553 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
554 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
555
556 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
557
558 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
559 ctx->rasterizer->base.offset_tri);
560 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
561 ctx->rasterizer->base.offset_tri);
562 }
563
564 static void
565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
566 struct mali_shader_meta *fragmeta)
567 {
568 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
569 int zfunc = PIPE_FUNC_ALWAYS;
570
571 if (!zsa) {
572 struct pipe_stencil_state default_stencil = {
573 .enabled = 0,
574 .func = PIPE_FUNC_ALWAYS,
575 .fail_op = MALI_STENCIL_KEEP,
576 .zfail_op = MALI_STENCIL_KEEP,
577 .zpass_op = MALI_STENCIL_KEEP,
578 .writemask = 0xFF,
579 .valuemask = 0xFF
580 };
581
582 panfrost_make_stencil_state(&default_stencil,
583 &fragmeta->stencil_front);
584 fragmeta->stencil_mask_front = default_stencil.writemask;
585 fragmeta->stencil_back = fragmeta->stencil_front;
586 fragmeta->stencil_mask_back = default_stencil.writemask;
587 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
588 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
589 } else {
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
591 zsa->stencil[0].enabled);
592 panfrost_make_stencil_state(&zsa->stencil[0],
593 &fragmeta->stencil_front);
594 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
595 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
596
597 /* If back-stencil is not enabled, use the front values */
598
599 if (zsa->stencil[1].enabled) {
600 panfrost_make_stencil_state(&zsa->stencil[1],
601 &fragmeta->stencil_back);
602 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
603 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
604 } else {
605 fragmeta->stencil_back = fragmeta->stencil_front;
606 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
607 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
608 }
609
610 if (zsa->depth.enabled)
611 zfunc = zsa->depth.func;
612
613 /* Depth state (TODO: Refactor) */
614
615 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
616 zsa->depth.writemask);
617 }
618
619 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
620 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
621 }
622
623 static bool
624 panfrost_fs_required(
625 struct panfrost_shader_state *fs,
626 struct panfrost_blend_final *blend,
627 unsigned rt_count)
628 {
629 /* If we generally have side effects */
630 if (fs->fs_sidefx)
631 return true;
632
633 /* If colour is written we need to execute */
634 for (unsigned i = 0; i < rt_count; ++i) {
635 if (!blend[i].no_colour)
636 return true;
637 }
638
639 /* If depth is written and not implied we need to execute.
640 * TODO: Predicate on Z/S writes being enabled */
641 return (fs->writes_depth || fs->writes_stencil);
642 }
643
644 static void
645 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
646 struct mali_shader_meta *fragmeta,
647 void *rts)
648 {
649 const struct panfrost_device *dev = pan_device(ctx->base.screen);
650 struct panfrost_shader_state *fs;
651 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
652
653 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
654 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
655 !ctx->blend->base.dither);
656
657 /* Get blending setup */
658 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
659
660 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
661 unsigned shader_offset = 0;
662 struct panfrost_bo *shader_bo = NULL;
663
664 for (unsigned c = 0; c < rt_count; ++c)
665 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
666 &shader_offset);
667
668 /* Disable shader execution if we can */
669 if (dev->quirks & MIDGARD_SHADERLESS
670 && !panfrost_fs_required(fs, blend, rt_count)) {
671 fragmeta->shader = 0;
672 fragmeta->attribute_count = 0;
673 fragmeta->varying_count = 0;
674 fragmeta->texture_count = 0;
675 fragmeta->sampler_count = 0;
676
677 /* This feature is not known to work on Bifrost */
678 fragmeta->midgard1.work_count = 1;
679 fragmeta->midgard1.uniform_count = 0;
680 fragmeta->midgard1.uniform_buffer_count = 0;
681 }
682
683 /* If there is a blend shader, work registers are shared. Blend shaders may
684 * use up to 8 work registers, so reserve at least that many. Should be lower XXX */
685
686 if (!(dev->quirks & IS_BIFROST)) {
687 for (unsigned c = 0; c < rt_count; ++c) {
688 if (blend[c].is_shader) {
689 fragmeta->midgard1.work_count =
690 MAX2(fragmeta->midgard1.work_count, 8);
691 }
692 }
693 }
694
695 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
696 * copied to the blend_meta appended (by convention), but this is the
697 * field actually read by the hardware. (Or maybe both are read...?).
698 * Specify the last RTi with a blend shader. */
699
700 fragmeta->blend.shader = 0;
701
702 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
703 if (!blend[rt].is_shader)
704 continue;
705
706 fragmeta->blend.shader = blend[rt].shader.gpu |
707 blend[rt].shader.first_tag;
708 break;
709 }
710
711 if (dev->quirks & MIDGARD_SFBD) {
712 /* On single render target (SFBD) hardware, the blend
713 * information is inside the shader meta itself. We additionally
714 * need to signal CAN_DISCARD for nontrivial blend modes (so
715 * we're able to read back the destination buffer) */
716
717 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
718 blend[0].is_shader);
719
720 if (!blend[0].is_shader) {
721 fragmeta->blend.equation = *blend[0].equation.equation;
722 fragmeta->blend.constant = blend[0].equation.constant;
723 }
724
725 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
726 !blend[0].no_blending || fs->can_discard);
727 return;
728 }
729
730 /* Additional blend descriptor tacked on for jobs using MFBD */
731
732 for (unsigned i = 0; i < rt_count; ++i) {
733 if (dev->quirks & IS_BIFROST) {
734 struct bifrost_blend_rt *brts = rts;
735
736 if (blend[i].is_shader) {
737 /* The blend shader's address needs to be at
738 * the same top 32 bit as the fragment shader.
739 * TODO: Ensure that's always the case.
740 */
741 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
742 (fs->bo->gpu & (0xffffffffull << 32)));
743 brts[i].shader = blend[i].shader.gpu;
744 brts[i].unk2 = 0x0;
745 brts[i].flags = 0x200;
746 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
747 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
748 const struct util_format_description *format_desc;
749 format_desc = util_format_description(format);
750
751 brts[i].equation = *blend[i].equation.equation;
752
753 /* TODO: this is a bit more complicated */
754 brts[i].constant = blend[i].equation.constant;
755
756 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
757 brts[i].unk2 = 0x19;
758
759 brts[i].shader_type = fs->blend_types[i];
760 brts[i].flags = 0x200;
761 } else {
762 /* Dummy attachment for depth-only */
763 brts[i].unk2 = 0x3;
764 brts[i].shader_type = fs->blend_types[i];
765 }
766 } else {
767 struct midgard_blend_rt *mrts = rts;
768
769 if (!blend[i].no_colour) {
770 mrts[i].flags = 0x200;
771
772 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
773 (ctx->pipe_framebuffer.cbufs[i]) &&
774 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
775
776 SET_BIT(mrts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
777 SET_BIT(mrts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
778 SET_BIT(mrts[i].flags, MALI_BLEND_SRGB, is_srgb);
779 SET_BIT(mrts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
780 }
781
782 if (blend[i].is_shader) {
783 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
784 } else {
785 mrts[i].blend.equation = *blend[i].equation.equation;
786 mrts[i].blend.constant = blend[i].equation.constant;
787 }
788 }
789 }
790 }
791
792 static void
793 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
794 struct mali_shader_meta *fragmeta,
795 void *rts)
796 {
797 const struct panfrost_device *dev = pan_device(ctx->base.screen);
798 struct panfrost_shader_state *fs;
799
800 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
801
802 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
803 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
804 fragmeta->unknown2_4 = 0x4e0;
805
806 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
807 * is required (independent of 32-bit/64-bit descriptors), or why it's
808 * not used on later GPU revisions. Otherwise, all shader jobs fault on
809 * these earlier chips (perhaps this is a chicken bit of some kind).
810 * More investigation is needed. */
811
812 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
813
814 if (dev->quirks & IS_BIFROST) {
815 /* TODO */
816 } else {
817 /* Depending on whether it's legal in the given shader, we try to
818 * enable early-z testing (or forward-pixel kill?) */
819
820 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
821 !fs->can_discard && !fs->writes_depth);
822
823 /* Add the writes Z/S flags if needed. */
824 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
825 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
826
827 /* Any time texturing is used, derivatives are implicitly calculated,
828 * so we need to enable helper invocations */
829
830 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
831 fs->helper_invocations);
832
833 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
834
835 bool depth_enabled = fs->writes_depth ||
836 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
837
838 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
839 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
840 }
841
842 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
843 panfrost_frag_meta_zsa_update(ctx, fragmeta);
844 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
845 }
846
847 void
848 panfrost_emit_shader_meta(struct panfrost_batch *batch,
849 enum pipe_shader_type st,
850 struct mali_vertex_tiler_postfix *postfix)
851 {
852 struct panfrost_context *ctx = batch->ctx;
853 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
854
855 if (!ss) {
856 postfix->shader = 0;
857 return;
858 }
859
860 struct mali_shader_meta meta;
861
862 panfrost_shader_meta_init(ctx, st, &meta);
863
864 /* Add the shader BO to the batch. */
865 panfrost_batch_add_bo(batch, ss->bo,
866 PAN_BO_ACCESS_PRIVATE |
867 PAN_BO_ACCESS_READ |
868 panfrost_bo_access_for_stage(st));
869
870 mali_ptr shader_ptr;
871
872 if (st == PIPE_SHADER_FRAGMENT) {
873 struct panfrost_device *dev = pan_device(ctx->base.screen);
874 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
875 size_t desc_size = sizeof(meta);
876 void *rts = NULL;
877 struct panfrost_transfer xfer;
878 unsigned rt_size;
879
880 if (dev->quirks & MIDGARD_SFBD)
881 rt_size = 0;
882 else if (dev->quirks & IS_BIFROST)
883 rt_size = sizeof(struct bifrost_blend_rt);
884 else
885 rt_size = sizeof(struct midgard_blend_rt);
886
887 desc_size += rt_size * rt_count;
888
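/* The per-RT blend descriptors are built CPU-side first and then copied
 * into the same transient allocation immediately after the shader
 * descriptor. */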
889 if (rt_size)
890 rts = rzalloc_size(ctx, rt_size * rt_count);
891
892 panfrost_frag_shader_meta_init(ctx, &meta, rts);
893
894 xfer = panfrost_allocate_transient(batch, desc_size);
895
896 memcpy(xfer.cpu, &meta, sizeof(meta));
897 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
898
899 if (rt_size)
900 ralloc_free(rts);
901
902 shader_ptr = xfer.gpu;
903 } else {
904 shader_ptr = panfrost_upload_transient(batch, &meta,
905 sizeof(meta));
906 }
907
908 postfix->shader = shader_ptr;
909 }
910
911 static void
912 panfrost_mali_viewport_init(struct panfrost_context *ctx,
913 struct mali_viewport *mvp)
914 {
915 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
916
917 /* Clip bounds are encoded as floats. The viewport itself is encoded as
918 * (somewhat) asymmetric ints. */
919
920 const struct pipe_scissor_state *ss = &ctx->scissor;
921
922 memset(mvp, 0, sizeof(*mvp));
923
924 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
925 * each direction. Clipping to the viewport in theory should work, but
926 * in practice causes issues when we're not explicitly trying to
927 * scissor */
928
929 *mvp = (struct mali_viewport) {
930 .clip_minx = -INFINITY,
931 .clip_miny = -INFINITY,
932 .clip_maxx = INFINITY,
933 .clip_maxy = INFINITY,
934 };
935
936 /* Always scissor to the viewport by default. */
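/* Gallium viewport state maps NDC [-1, 1] to
 * [translate - |scale|, translate + |scale|] on each axis, which yields the
 * screen-space bounds computed below. */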
937 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
938 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
939
940 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
941 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
942
943 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
944 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
945
946 /* Apply the scissor test */
947
948 unsigned minx, miny, maxx, maxy;
949
950 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
951 minx = MAX2(ss->minx, vp_minx);
952 miny = MAX2(ss->miny, vp_miny);
953 maxx = MIN2(ss->maxx, vp_maxx);
954 maxy = MIN2(ss->maxy, vp_maxy);
955 } else {
956 minx = vp_minx;
957 miny = vp_miny;
958 maxx = vp_maxx;
959 maxy = vp_maxy;
960 }
961
962 /* Hardware needs the min/max to be strictly ordered, so flip if we
963 * need to. The viewport transformation in the vertex shader will
964 * handle the negatives if we don't */
965
966 if (miny > maxy) {
967 unsigned temp = miny;
968 miny = maxy;
969 maxy = temp;
970 }
971
972 if (minx > maxx) {
973 unsigned temp = minx;
974 minx = maxx;
975 maxx = temp;
976 }
977
978 if (minz > maxz) {
979 float temp = minz;
980 minz = maxz;
981 maxz = temp;
982 }
983
984 /* Clamp to the framebuffer size as a last check */
985
986 minx = MIN2(ctx->pipe_framebuffer.width, minx);
987 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
988
989 miny = MIN2(ctx->pipe_framebuffer.height, miny);
990 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
991
992 /* Upload */
993
994 mvp->viewport0[0] = minx;
995 mvp->viewport1[0] = MALI_POSITIVE(maxx);
996
997 mvp->viewport0[1] = miny;
998 mvp->viewport1[1] = MALI_POSITIVE(maxy);
999
1000 mvp->clip_minz = minz;
1001 mvp->clip_maxz = maxz;
1002 }
1003
1004 void
1005 panfrost_emit_viewport(struct panfrost_batch *batch,
1006 struct mali_vertex_tiler_postfix *tiler_postfix)
1007 {
1008 struct panfrost_context *ctx = batch->ctx;
1009 struct mali_viewport mvp;
1010
1011 panfrost_mali_viewport_init(batch->ctx, &mvp);
1012
1013 /* Update the job, unless we're doing wallpapering (whose lack of
1014 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1015 * just... be faster :) */
1016
1017 if (!ctx->wallpaper_batch)
1018 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1019 mvp.viewport0[1],
1020 mvp.viewport1[0] + 1,
1021 mvp.viewport1[1] + 1);
1022
1023 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1024 sizeof(mvp));
1025 }
1026
1027 static mali_ptr
1028 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1029 enum pipe_shader_type st,
1030 struct panfrost_constant_buffer *buf,
1031 unsigned index)
1032 {
1033 struct pipe_constant_buffer *cb = &buf->cb[index];
1034 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1035
1036 if (rsrc) {
1037 panfrost_batch_add_bo(batch, rsrc->bo,
1038 PAN_BO_ACCESS_SHARED |
1039 PAN_BO_ACCESS_READ |
1040 panfrost_bo_access_for_stage(st));
1041
1042 /* Alignment guaranteed by
1043 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1044 return rsrc->bo->gpu + cb->buffer_offset;
1045 } else if (cb->user_buffer) {
1046 return panfrost_upload_transient(batch,
1047 cb->user_buffer +
1048 cb->buffer_offset,
1049 cb->buffer_size);
1050 } else {
1051 unreachable("No constant buffer");
1052 }
1053 }
1054
1055 struct sysval_uniform {
1056 union {
1057 float f[4];
1058 int32_t i[4];
1059 uint32_t u[4];
1060 uint64_t du[2];
1061 };
1062 };
1063
1064 static void
1065 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1066 struct sysval_uniform *uniform)
1067 {
1068 struct panfrost_context *ctx = batch->ctx;
1069 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1070
1071 uniform->f[0] = vp->scale[0];
1072 uniform->f[1] = vp->scale[1];
1073 uniform->f[2] = vp->scale[2];
1074 }
1075
1076 static void
1077 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1078 struct sysval_uniform *uniform)
1079 {
1080 struct panfrost_context *ctx = batch->ctx;
1081 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1082
1083 uniform->f[0] = vp->translate[0];
1084 uniform->f[1] = vp->translate[1];
1085 uniform->f[2] = vp->translate[2];
1086 }
1087
1088 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1089 enum pipe_shader_type st,
1090 unsigned int sysvalid,
1091 struct sysval_uniform *uniform)
1092 {
1093 struct panfrost_context *ctx = batch->ctx;
1094 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1095 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1096 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1097 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1098
1099 assert(dim);
1100 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1101
1102 if (dim > 1)
1103 uniform->i[1] = u_minify(tex->texture->height0,
1104 tex->u.tex.first_level);
1105
1106 if (dim > 2)
1107 uniform->i[2] = u_minify(tex->texture->depth0,
1108 tex->u.tex.first_level);
1109
1110 if (is_array)
1111 uniform->i[dim] = tex->texture->array_size;
1112 }
1113
1114 static void
1115 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1116 enum pipe_shader_type st,
1117 unsigned ssbo_id,
1118 struct sysval_uniform *uniform)
1119 {
1120 struct panfrost_context *ctx = batch->ctx;
1121
1122 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1123 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1124
1125 /* Compute address */
1126 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1127
1128 panfrost_batch_add_bo(batch, bo,
1129 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1130 panfrost_bo_access_for_stage(st));
1131
1132 /* Upload address and size as sysval */
1133 uniform->du[0] = bo->gpu + sb.buffer_offset;
1134 uniform->u[2] = sb.buffer_size;
1135 }
1136
1137 static void
1138 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1139 enum pipe_shader_type st,
1140 unsigned samp_idx,
1141 struct sysval_uniform *uniform)
1142 {
1143 struct panfrost_context *ctx = batch->ctx;
1144 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1145
1146 uniform->f[0] = sampl->min_lod;
1147 uniform->f[1] = sampl->max_lod;
1148 uniform->f[2] = sampl->lod_bias;
1149
1150 /* Even without any errata, Midgard represents "no mipmapping" as
1151 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1152 * panfrost_create_sampler_state which also explains our choice of
1153 * epsilon value (again to keep behaviour consistent) */
1154
1155 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1156 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1157 }
1158
1159 static void
1160 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1161 struct sysval_uniform *uniform)
1162 {
1163 struct panfrost_context *ctx = batch->ctx;
1164
1165 uniform->u[0] = ctx->compute_grid->grid[0];
1166 uniform->u[1] = ctx->compute_grid->grid[1];
1167 uniform->u[2] = ctx->compute_grid->grid[2];
1168 }
1169
1170 static void
1171 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1172 struct panfrost_shader_state *ss,
1173 enum pipe_shader_type st)
1174 {
1175 struct sysval_uniform *uniforms = (void *)buf;
1176
1177 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1178 int sysval = ss->sysval[i];
1179
1180 switch (PAN_SYSVAL_TYPE(sysval)) {
1181 case PAN_SYSVAL_VIEWPORT_SCALE:
1182 panfrost_upload_viewport_scale_sysval(batch,
1183 &uniforms[i]);
1184 break;
1185 case PAN_SYSVAL_VIEWPORT_OFFSET:
1186 panfrost_upload_viewport_offset_sysval(batch,
1187 &uniforms[i]);
1188 break;
1189 case PAN_SYSVAL_TEXTURE_SIZE:
1190 panfrost_upload_txs_sysval(batch, st,
1191 PAN_SYSVAL_ID(sysval),
1192 &uniforms[i]);
1193 break;
1194 case PAN_SYSVAL_SSBO:
1195 panfrost_upload_ssbo_sysval(batch, st,
1196 PAN_SYSVAL_ID(sysval),
1197 &uniforms[i]);
1198 break;
1199 case PAN_SYSVAL_NUM_WORK_GROUPS:
1200 panfrost_upload_num_work_groups_sysval(batch,
1201 &uniforms[i]);
1202 break;
1203 case PAN_SYSVAL_SAMPLER:
1204 panfrost_upload_sampler_sysval(batch, st,
1205 PAN_SYSVAL_ID(sysval),
1206 &uniforms[i]);
1207 break;
1208 default:
1209 assert(0);
1210 }
1211 }
1212 }
1213
1214 static const void *
1215 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1216 unsigned index)
1217 {
1218 struct pipe_constant_buffer *cb = &buf->cb[index];
1219 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1220
1221 if (rsrc)
1222 return rsrc->bo->cpu;
1223 else if (cb->user_buffer)
1224 return cb->user_buffer;
1225 else
1226 unreachable("No constant buffer");
1227 }
1228
1229 void
1230 panfrost_emit_const_buf(struct panfrost_batch *batch,
1231 enum pipe_shader_type stage,
1232 struct mali_vertex_tiler_postfix *postfix)
1233 {
1234 struct panfrost_context *ctx = batch->ctx;
1235 struct panfrost_shader_variants *all = ctx->shader[stage];
1236
1237 if (!all)
1238 return;
1239
1240 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1241
1242 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1243
1244 /* Uniforms are implicitly UBO #0 */
1245 bool has_uniforms = buf->enabled_mask & (1 << 0);
1246
1247 /* Allocate room for the sysvals and the uniforms */
1248 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1249 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1250 size_t size = sys_size + uniform_size;
1251 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1252 size);
1253
1254 /* Upload sysvals requested by the shader */
1255 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1256
1257 /* Upload uniforms */
1258 if (has_uniforms && uniform_size) {
1259 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1260 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1261 }
1262
1263 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1264 * uploaded */
1265
1266 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1267 assert(ubo_count >= 1);
1268
1269 size_t sz = sizeof(uint64_t) * ubo_count;
1270 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1271 int uniform_count = ss->uniform_count;
1272
1273 /* Upload uniforms as a UBO */
1274 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1275
1276 /* The rest are honest-to-goodness UBOs */
1277
1278 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1279 size_t usz = buf->cb[ubo].buffer_size;
1280 bool enabled = buf->enabled_mask & (1 << ubo);
1281 bool empty = usz == 0;
1282
1283 if (!enabled || empty) {
1284 /* Stub out disabled UBOs to catch accesses */
1285 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1286 continue;
1287 }
1288
1289 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1290 buf, ubo);
1291
1292 unsigned bytes_per_field = 16;
1293 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1294 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1295 }
1296
1297 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1298 postfix->uniforms = transfer.gpu;
1299 postfix->uniform_buffers = ubufs;
1300
1301 buf->dirty_mask = 0;
1302 }
1303
1304 void
1305 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1306 const struct pipe_grid_info *info,
1307 struct midgard_payload_vertex_tiler *vtp)
1308 {
1309 struct panfrost_context *ctx = batch->ctx;
1310 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1311 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1312 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1313 128));
1314 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1315 info->grid[2] * 4;
1316 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1317 shared_size,
1318 1);
1319
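/* shared_workgroup_count is a log2: summing the per-dimension ceil(log2)
 * values gives an upper bound on log2 of the total workgroup count, and the
 * per-workgroup allocation size is likewise encoded as a shift. */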
1320 struct mali_shared_memory shared = {
1321 .shared_memory = bo->gpu,
1322 .shared_workgroup_count =
1323 util_logbase2_ceil(info->grid[0]) +
1324 util_logbase2_ceil(info->grid[1]) +
1325 util_logbase2_ceil(info->grid[2]),
1326 .shared_unk1 = 0x2,
1327 .shared_shift = util_logbase2(single_size) - 1
1328 };
1329
1330 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1331 sizeof(shared));
1332 }
1333
1334 static mali_ptr
1335 panfrost_get_tex_desc(struct panfrost_batch *batch,
1336 enum pipe_shader_type st,
1337 struct panfrost_sampler_view *view)
1338 {
1339 if (!view)
1340 return (mali_ptr) 0;
1341
1342 struct pipe_sampler_view *pview = &view->base;
1343 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1344
1345 /* Add the BO to the job so it's retained until the job is done. */
1346
1347 panfrost_batch_add_bo(batch, rsrc->bo,
1348 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1349 panfrost_bo_access_for_stage(st));
1350
1351 panfrost_batch_add_bo(batch, view->midgard_bo,
1352 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1353 panfrost_bo_access_for_stage(st));
1354
1355 return view->midgard_bo->gpu;
1356 }
1357
1358 void
1359 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1360 enum pipe_shader_type stage,
1361 struct mali_vertex_tiler_postfix *postfix)
1362 {
1363 struct panfrost_context *ctx = batch->ctx;
1364 struct panfrost_device *device = pan_device(ctx->base.screen);
1365
1366 if (!ctx->sampler_view_count[stage])
1367 return;
1368
1369 if (device->quirks & IS_BIFROST) {
1370 struct bifrost_texture_descriptor *descriptors;
1371
1372 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1373 ctx->sampler_view_count[stage]);
1374
1375 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1376 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1377 struct pipe_sampler_view *pview = &view->base;
1378 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1379
1380 /* Add the BOs to the job so they are retained until the job is done. */
1381
1382 panfrost_batch_add_bo(batch, rsrc->bo,
1383 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1384 panfrost_bo_access_for_stage(stage));
1385
1386 panfrost_batch_add_bo(batch, view->bifrost_bo,
1387 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1388 panfrost_bo_access_for_stage(stage));
1389
1390 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1391 }
1392
1393 postfix->textures = panfrost_upload_transient(batch,
1394 descriptors,
1395 sizeof(struct bifrost_texture_descriptor) *
1396 ctx->sampler_view_count[stage]);
1397
1398 free(descriptors);
1399 } else {
1400 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1401
1402 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1403 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1404 ctx->sampler_views[stage][i]);
1405
1406 postfix->textures = panfrost_upload_transient(batch,
1407 trampolines,
1408 sizeof(uint64_t) *
1409 ctx->sampler_view_count[stage]);
1410 }
1411 }
1412
1413 void
1414 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1415 enum pipe_shader_type stage,
1416 struct mali_vertex_tiler_postfix *postfix)
1417 {
1418 struct panfrost_context *ctx = batch->ctx;
1419 struct panfrost_device *device = pan_device(ctx->base.screen);
1420
1421 if (!ctx->sampler_count[stage])
1422 return;
1423
1424 if (device->quirks & IS_BIFROST) {
1425 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1426 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1427 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1428 transfer_size);
1429 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1430
1431 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1432 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1433
1434 postfix->sampler_descriptor = transfer.gpu;
1435 } else {
1436 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1437 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1438 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1439 transfer_size);
1440 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1441
1442 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1443 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1444
1445 postfix->sampler_descriptor = transfer.gpu;
1446 }
1447 }
1448
1449 void
1450 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1451 struct mali_vertex_tiler_postfix *vertex_postfix)
1452 {
1453 struct panfrost_context *ctx = batch->ctx;
1454
1455 if (!ctx->vertex)
1456 return;
1457
1458 struct panfrost_vertex_state *so = ctx->vertex;
1459
1460 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1461 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1462 sizeof(*so->hw) *
1463 PAN_MAX_ATTRIBUTE);
1464 }
1465
1466 void
1467 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1468 struct mali_vertex_tiler_postfix *vertex_postfix)
1469 {
1470 struct panfrost_context *ctx = batch->ctx;
1471 struct panfrost_vertex_state *so = ctx->vertex;
1472
1473 /* Staged mali_attr, and index into them. i != k, depending on the
1474 * vertex buffer mask and instancing. Twice as much room is allocated,
1475 * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1476 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1477 unsigned k = 0;
1478
1479 for (unsigned i = 0; i < so->num_elements; ++i) {
1480 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1481 * means duplicating some vertex buffers (who cares? aside from
1482 * maybe some caching implications but I somehow doubt that
1483 * matters) */
1484
1485 struct pipe_vertex_element *elem = &so->pipe[i];
1486 unsigned vbi = elem->vertex_buffer_index;
1487
1488 /* The exception to 1:1 mapping is that we can have multiple
1489 * entries (NPOT divisors), so we fixup anyways */
1490
1491 so->hw[i].index = k;
1492
1493 if (!(ctx->vb_mask & (1 << vbi)))
1494 continue;
1495
1496 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1497 struct panfrost_resource *rsrc;
1498
1499 rsrc = pan_resource(buf->buffer.resource);
1500 if (!rsrc)
1501 continue;
1502
1503 /* Align to 64 bytes by masking off the lower bits. This
1504 * will be adjusted back when we fixup the src_offset in
1505 * mali_attr_meta */
1506
1507 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1508 mali_ptr addr = raw_addr & ~63;
1509 unsigned chopped_addr = raw_addr - addr;
1510
1511 /* Add a dependency of the batch on the vertex buffer */
1512 panfrost_batch_add_bo(batch, rsrc->bo,
1513 PAN_BO_ACCESS_SHARED |
1514 PAN_BO_ACCESS_READ |
1515 PAN_BO_ACCESS_VERTEX_TILER);
1516
1517 /* Set common fields */
1518 attrs[k].elements = addr;
1519 attrs[k].stride = buf->stride;
1520
1521 /* Since we advanced the base pointer, we shrink the buffer
1522 * size */
1523 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1524
1525 /* We need to add the extra size we masked off (for
1526 * correctness) so the data doesn't get clamped away */
1527 attrs[k].size += chopped_addr;
1528
1529 /* For non-instancing make sure we initialize */
1530 attrs[k].shift = attrs[k].extra_flags = 0;
1531
1532 /* Instancing uses a dramatically different code path than
1533 * linear, so dispatch for the actual emission now that the
1534 * common code is finished */
1535
1536 unsigned divisor = elem->instance_divisor;
1537
1538 if (divisor && ctx->instance_count == 1) {
1539 /* Silly corner case where there's a divisor(=1) but
1540 * there's no legitimate instancing. So we want *every*
1541 * attribute to be the same. So set stride to zero so
1542 * we don't go anywhere. */
1543
1544 attrs[k].size = attrs[k].stride + chopped_addr;
1545 attrs[k].stride = 0;
1546 attrs[k++].elements |= MALI_ATTR_LINEAR;
1547 } else if (ctx->instance_count <= 1) {
1548 /* Normal, non-instanced attributes */
1549 attrs[k++].elements |= MALI_ATTR_LINEAR;
1550 } else {
1551 unsigned instance_shift = vertex_postfix->instance_shift;
1552 unsigned instance_odd = vertex_postfix->instance_odd;
1553
1554 k += panfrost_vertex_instanced(ctx->padded_count,
1555 instance_shift,
1556 instance_odd,
1557 divisor, &attrs[k]);
1558 }
1559 }
1560
1561 /* Add special gl_VertexID/gl_InstanceID buffers */
1562
1563 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1564 so->hw[PAN_VERTEX_ID].index = k++;
1565 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1566 so->hw[PAN_INSTANCE_ID].index = k++;
1567
1568 /* Upload whatever we emitted and go */
1569
1570 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1571 k * sizeof(*attrs));
1572 }
1573
1574 static mali_ptr
1575 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1576 unsigned stride, unsigned count)
1577 {
1578 /* Fill out the descriptor */
1579 slot->stride = stride;
1580 slot->size = stride * count;
1581 slot->shift = slot->extra_flags = 0;
1582
1583 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1584 slot->size);
1585
1586 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1587
1588 return transfer.gpu;
1589 }
1590
1591 static void
1592 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1593 unsigned stride, unsigned offset, unsigned count,
1594 struct pipe_stream_output_target *target)
1595 {
1596 /* Fill out the descriptor */
1597 slot->stride = stride * 4;
1598 slot->shift = slot->extra_flags = 0;
1599
1600 unsigned max_size = target->buffer_size;
1601 unsigned expected_size = slot->stride * count;
1602
1603 slot->size = MIN2(max_size, expected_size);
1604
1605 /* Grab the BO and bind it to the batch */
1606 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1607
1608 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1609 * the perspective of the TILER and FRAGMENT.
1610 */
1611 panfrost_batch_add_bo(batch, bo,
1612 PAN_BO_ACCESS_SHARED |
1613 PAN_BO_ACCESS_RW |
1614 PAN_BO_ACCESS_VERTEX_TILER |
1615 PAN_BO_ACCESS_FRAGMENT);
1616
1617 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1618 slot->elements = addr;
1619 }
1620
1621 /* Given a shader and buffer indices, link varying metadata together */
1622
1623 static bool
1624 is_special_varying(gl_varying_slot loc)
1625 {
1626 switch (loc) {
1627 case VARYING_SLOT_POS:
1628 case VARYING_SLOT_PSIZ:
1629 case VARYING_SLOT_PNTC:
1630 case VARYING_SLOT_FACE:
1631 return true;
1632 default:
1633 return false;
1634 }
1635 }
1636
1637 static void
1638 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1639 signed general, signed gl_Position,
1640 signed gl_PointSize, signed gl_PointCoord,
1641 signed gl_FrontFacing)
1642 {
1643 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1644
1645 for (unsigned i = 0; i < ss->varying_count; ++i) {
1646 gl_varying_slot location = ss->varyings_loc[i];
1647 int index = -1;
1648
1649 switch (location) {
1650 case VARYING_SLOT_POS:
1651 index = gl_Position;
1652 break;
1653 case VARYING_SLOT_PSIZ:
1654 index = gl_PointSize;
1655 break;
1656 case VARYING_SLOT_PNTC:
1657 index = gl_PointCoord;
1658 break;
1659 case VARYING_SLOT_FACE:
1660 index = gl_FrontFacing;
1661 break;
1662 default:
1663 index = general;
1664 break;
1665 }
1666
1667 assert(index >= 0);
1668 out[i].index = index;
1669 }
1670 }
1671
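/* The point sprite mask packs the per-texture-coordinate sprite enables
 * (TEX0..TEX7) in bits 0-7 and gl_PointCoord itself in bit 8. */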
1672 static bool
1673 has_point_coord(unsigned mask, gl_varying_slot loc)
1674 {
1675 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1676 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1677 else if (loc == VARYING_SLOT_PNTC)
1678 return (mask & (1 << 8));
1679 else
1680 return false;
1681 }
1682
1683 /* Helpers for manipulating stream out information so we can pack varyings
1684 * accordingly. Compute the src_offset for a given captured varying */
1685
1686 static struct pipe_stream_output *
1687 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1688 {
1689 for (unsigned i = 0; i < info->num_outputs; ++i) {
1690 if (info->output[i].register_index == loc)
1691 return &info->output[i];
1692 }
1693
1694 unreachable("Varying not captured");
1695 }
1696
1697 void
1698 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1699 unsigned vertex_count,
1700 struct mali_vertex_tiler_postfix *vertex_postfix,
1701 struct mali_vertex_tiler_postfix *tiler_postfix,
1702 union midgard_primitive_size *primitive_size)
1703 {
1704 /* Load the shaders */
1705 struct panfrost_context *ctx = batch->ctx;
1706 struct panfrost_shader_state *vs, *fs;
1707 unsigned int num_gen_varyings = 0;
1708 size_t vs_size, fs_size;
1709
1710 /* Allocate the varying descriptor */
1711
1712 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1713 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1714 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1715 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1716
1717 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1718 vs_size +
1719 fs_size);
1720
1721 struct pipe_stream_output_info *so = &vs->stream_output;
1722
1723 /* Check if this varying is linked by us. This is the case for
1724 * general-purpose, non-captured varyings. If it is, link it. If it's
1725 * not, use the provided stream out information to determine the
1726 * offset, since it was already linked for us. */
1727
1728 for (unsigned i = 0; i < vs->varying_count; i++) {
1729 gl_varying_slot loc = vs->varyings_loc[i];
1730
1731 bool special = is_special_varying(loc);
1732 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1733
1734 if (captured) {
1735 struct pipe_stream_output *o = pan_get_so(so, loc);
1736
1737 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1738 vs->varyings[i].src_offset = dst_offset;
1739 } else if (!special) {
1740 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1741 }
1742 }
1743
1744 /* Conversely, we need to set src_offset for the captured varyings.
1745 * Here, the layout is defined by the stream out info, not us */
1746
1747 /* Link up with fragment varyings */
1748 bool reads_point_coord = fs->reads_point_coord;
1749
1750 for (unsigned i = 0; i < fs->varying_count; i++) {
1751 gl_varying_slot loc = fs->varyings_loc[i];
1752 unsigned src_offset;
1753 signed vs_idx = -1;
1754
1755 /* Link up */
1756 for (unsigned j = 0; j < vs->varying_count; ++j) {
1757 if (vs->varyings_loc[j] == loc) {
1758 vs_idx = j;
1759 break;
1760 }
1761 }
1762
1763 /* Either assign or reuse */
1764 if (vs_idx >= 0)
1765 src_offset = vs->varyings[vs_idx].src_offset;
1766 else
1767 src_offset = 16 * (num_gen_varyings++);
1768
1769 fs->varyings[i].src_offset = src_offset;
1770
1771 if (has_point_coord(fs->point_sprite_mask, loc))
1772 reads_point_coord = true;
1773 }
1774
1775 memcpy(trans.cpu, vs->varyings, vs_size);
1776 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1777
1778 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1779
1780 /* Figure out how many streamout buffers could be bound */
1781 unsigned so_count = ctx->streamout.num_targets;
1782 for (unsigned i = 0; i < vs->varying_count; i++) {
1783 gl_varying_slot loc = vs->varyings_loc[i];
1784
1785 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1786 if (!captured) continue;
1787
1788 struct pipe_stream_output *o = pan_get_so(so, loc);
1789 so_count = MAX2(so_count, o->output_buffer + 1);
1790 }
1791
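        /* Lay out the attribute buffer indices: stream out buffers first, then
         * the general varying buffer, then one slot per special varying in use */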
1792 signed idx = so_count;
1793 signed general = idx++;
1794 signed gl_Position = idx++;
1795 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1796 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1797 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1798 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1799
1800 /* Emit the stream out buffers */
1801
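        /* Work out how many vertices stream out will actually write for this
         * primitive type and draw size */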
1802 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1803 ctx->vertex_count);
1804
1805 for (unsigned i = 0; i < so_count; ++i) {
1806 if (i < ctx->streamout.num_targets) {
1807 panfrost_emit_streamout(batch, &varyings[i],
1808 so->stride[i],
1809 ctx->streamout.offsets[i],
1810 out_count,
1811 ctx->streamout.targets[i]);
1812 } else {
1813 /* Emit a dummy buffer */
1814 panfrost_emit_varyings(batch, &varyings[i],
1815 so->stride[i] * 4,
1816 out_count);
1817
1818 /* Clear the attribute type */
1819 varyings[i].elements &= ~0xF;
1820 }
1821 }
1822
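        /* Emit the general varying buffer, 16 bytes per linked varying per vertex */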
1823 panfrost_emit_varyings(batch, &varyings[general],
1824 num_gen_varyings * 16,
1825 vertex_count);
1826
1827 mali_ptr varyings_p;
1828
1829 /* fp32 vec4 gl_Position */
1830 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1831 sizeof(float) * 4, vertex_count);
1832 tiler_postfix->position_varying = varyings_p;
1833
1834
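        /* 16-bit gl_PointSize per vertex */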
1835 if (panfrost_writes_point_size(ctx)) {
1836 varyings_p = panfrost_emit_varyings(batch,
1837 &varyings[gl_PointSize],
1838 2, vertex_count);
1839 primitive_size->pointer = varyings_p;
1840 }
1841
1842 if (reads_point_coord)
1843 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1844
1845 if (fs->reads_face)
1846 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1847
1848 if (fs->reads_frag_coord)
1849 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1850
1851 struct panfrost_device *device = pan_device(ctx->base.screen);
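        /* Reading gl_PointCoord is not hooked up on Bifrost yet */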
1852 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1853
1854 /* Let's go ahead and link varying meta to the buffer in question, now
1855 * that that information is available. VARYING_SLOT_POS is mapped to
1856          * gl_FragCoord for fragment shaders but gl_Position for vertex
1857          * shaders */
1858
1859 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1860 gl_PointSize, gl_PointCoord,
1861 gl_FrontFacing);
1862
1863 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1864 gl_FragCoord, gl_PointSize,
1865 gl_PointCoord, gl_FrontFacing);
1866
1867 /* Replace streamout */
1868
1869 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1870 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1871
1872 for (unsigned i = 0; i < vs->varying_count; i++) {
1873 gl_varying_slot loc = vs->varyings_loc[i];
1874
1875 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1876 if (!captured)
1877 continue;
1878
1879 struct pipe_stream_output *o = pan_get_so(so, loc);
1880 ovs[i].index = o->output_buffer;
1881
1882 assert(o->stream == 0);
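                /* Override the channel count of the recorded format to match
                 * the number of captured components */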
1883 ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
1884 | MALI_NR_CHANNELS(o->num_components);
1885
1886 if (device->quirks & HAS_SWIZZLES)
1887 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1888 else
1889 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1890
1891 /* Link to the fragment */
1892 signed fs_idx = -1;
1893
1894 /* Link up */
1895 for (unsigned j = 0; j < fs->varying_count; ++j) {
1896 if (fs->varyings_loc[j] == loc) {
1897 fs_idx = j;
1898 break;
1899 }
1900 }
1901
1902 if (fs_idx >= 0) {
1903 ofs[fs_idx].index = ovs[i].index;
1904 ofs[fs_idx].format = ovs[i].format;
1905 ofs[fs_idx].swizzle = ovs[i].swizzle;
1906 }
1907 }
1908
1909 /* Replace point sprite */
1910 for (unsigned i = 0; i < fs->varying_count; i++) {
1911                 /* If we have a point sprite replacement, handle that here. We
1912                  * have to translate the location first. TODO: flip Y in the
1913                  * shader instead; we already key shaders for this, it's just a time crunch */
1914
1915 if (has_point_coord(fs->point_sprite_mask,
1916 fs->varyings_loc[i])) {
1917 ofs[i].index = gl_PointCoord;
1918
1919 /* Swizzle out the z/w to 0/1 */
1920 ofs[i].format = MALI_RG16F;
1921 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1922 }
1923 }
1924
1925 /* Fix up unaligned addresses */
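        /* The record only keeps a 64-byte aligned address, so fold any
         * misalignment into each affected varying's src_offset instead */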
1926 for (unsigned i = 0; i < so_count; ++i) {
1927 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1928 continue;
1929
1930 unsigned align = (varyings[i].elements & 63);
1931
1932 /* While we're at it, the SO buffers are linear */
1933
1934 if (!align) {
1935 varyings[i].elements |= MALI_ATTR_LINEAR;
1936 continue;
1937 }
1938
1939 /* We need to adjust alignment */
1940 varyings[i].elements &= ~63;
1941 varyings[i].elements |= MALI_ATTR_LINEAR;
1942 varyings[i].size += align;
1943
1944 for (unsigned v = 0; v < vs->varying_count; ++v) {
1945 if (ovs[v].index != i)
1946 continue;
1947
1948 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1949 }
1950
1951 for (unsigned f = 0; f < fs->varying_count; ++f) {
1952 if (ofs[f].index != i)
1953 continue;
1954
1955 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1956 }
1957 }
1958
1959 varyings_p = panfrost_upload_transient(batch, varyings,
1960 idx * sizeof(*varyings));
1961 vertex_postfix->varyings = varyings_p;
1962 tiler_postfix->varyings = varyings_p;
1963
1964 vertex_postfix->varying_meta = trans.gpu;
1965 tiler_postfix->varying_meta = trans.gpu + vs_size;
1966 }
1967
1968 void
1969 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1970 struct mali_vertex_tiler_prefix *vertex_prefix,
1971 struct mali_vertex_tiler_postfix *vertex_postfix,
1972 struct mali_vertex_tiler_prefix *tiler_prefix,
1973 struct mali_vertex_tiler_postfix *tiler_postfix,
1974 union midgard_primitive_size *primitive_size)
1975 {
1976 struct panfrost_context *ctx = batch->ctx;
1977 struct panfrost_device *device = pan_device(ctx->base.screen);
1978 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1979 struct bifrost_payload_vertex bifrost_vertex = {0,};
1980 struct bifrost_payload_tiler bifrost_tiler = {0,};
1981 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1982 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1983 void *vp, *tp;
1984 size_t vp_size, tp_size;
1985
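        /* Pack the shared prefix/postfix descriptors into the payload layout
         * for this GPU generation */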
1986 if (device->quirks & IS_BIFROST) {
1987 bifrost_vertex.prefix = *vertex_prefix;
1988 bifrost_vertex.postfix = *vertex_postfix;
1989 vp = &bifrost_vertex;
1990 vp_size = sizeof(bifrost_vertex);
1991
1992 bifrost_tiler.prefix = *tiler_prefix;
1993 bifrost_tiler.tiler.primitive_size = *primitive_size;
1994 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1995 bifrost_tiler.postfix = *tiler_postfix;
1996 tp = &bifrost_tiler;
1997 tp_size = sizeof(bifrost_tiler);
1998 } else {
1999 midgard_vertex.prefix = *vertex_prefix;
2000 midgard_vertex.postfix = *vertex_postfix;
2001 vp = &midgard_vertex;
2002 vp_size = sizeof(midgard_vertex);
2003
2004 midgard_tiler.prefix = *tiler_prefix;
2005 midgard_tiler.postfix = *tiler_postfix;
2006 midgard_tiler.primitive_size = *primitive_size;
2007 tp = &midgard_tiler;
2008 tp_size = sizeof(midgard_tiler);
2009 }
2010
2011 if (wallpapering) {
2012 /* Inject in reverse order, with "predicted" job indices.
2013 * THIS IS A HACK XXX */
2014 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2015 batch->job_index + 2, tp, tp_size, true);
2016 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2017 vp, vp_size, true);
2018 return;
2019 }
2020
2021         /* If rasterizer discard is enabled, only submit the vertex job */
2022
2023 bool rasterizer_discard = ctx->rasterizer &&
2024 ctx->rasterizer->base.rasterizer_discard;
2025
2026 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2027 vp, vp_size, false);
2028
2029 if (rasterizer_discard)
2030 return;
2031
2032 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2033 false);
2034 }
2035
2036 /* TODO: stop hardcoding this */
2037 mali_ptr
2038 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2039 {
2040 uint16_t locations[] = {
2041 128, 128,
2042 0, 256,
2043 0, 256,
2044 0, 256,
2045 0, 256,
2046 0, 256,
2047 0, 256,
2048 0, 256,
2049 0, 256,
2050 0, 256,
2051 0, 256,
2052 0, 256,
2053 0, 256,
2054 0, 256,
2055 0, 256,
2056 0, 256,
2057 0, 256,
2058 0, 256,
2059 0, 256,
2060 0, 256,
2061 0, 256,
2062 0, 256,
2063 0, 256,
2064 0, 256,
2065 0, 256,
2066 0, 256,
2067 0, 256,
2068 0, 256,
2069 0, 256,
2070 0, 256,
2071 0, 256,
2072 0, 256,
2073 128, 128,
2074 0, 0,
2075 0, 0,
2076 0, 0,
2077 0, 0,
2078 0, 0,
2079 0, 0,
2080 0, 0,
2081 0, 0,
2082 0, 0,
2083 0, 0,
2084 0, 0,
2085 0, 0,
2086 0, 0,
2087 0, 0,
2088 0, 0,
2089 };
2090
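        /* 48 hardcoded (x, y) pairs, 96 halfwords total */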
2091 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2092 }