panfrost: Share MRT blend flag calculation with Bifrost
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
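/* On Bifrost the postfix's shared_memory field points at a thread-local
 * storage descriptor (stack/scratchpad) for the batch rather than at a
 * framebuffer; this helper uploads that descriptor. (Descriptive note added
 * here; the Midgard path below attaches the framebuffer instead.) */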
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190  * good for the duration of the draw (transient), though it could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
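/* Note on the encoding below (an explanatory sketch, not from the original
 * source): the padded count is stored as (2 * instance_odd + 1) << instance_shift.
 * __builtin_ctz() recovers the power-of-two part, and the remaining odd factor
 * m is stored halved, i.e. instance_odd = (m - 1) / 2. */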
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349 meta->midgard1.flags_hi = 0x8; /* XXX */
350 meta->midgard1.flags_lo = 0x220;
351 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
352 }
353 }
354
355 static unsigned
356 panfrost_translate_compare_func(enum pipe_compare_func in)
357 {
358 switch (in) {
359 case PIPE_FUNC_NEVER:
360 return MALI_FUNC_NEVER;
361
362 case PIPE_FUNC_LESS:
363 return MALI_FUNC_LESS;
364
365 case PIPE_FUNC_EQUAL:
366 return MALI_FUNC_EQUAL;
367
368 case PIPE_FUNC_LEQUAL:
369 return MALI_FUNC_LEQUAL;
370
371 case PIPE_FUNC_GREATER:
372 return MALI_FUNC_GREATER;
373
374 case PIPE_FUNC_NOTEQUAL:
375 return MALI_FUNC_NOTEQUAL;
376
377 case PIPE_FUNC_GEQUAL:
378 return MALI_FUNC_GEQUAL;
379
380 case PIPE_FUNC_ALWAYS:
381 return MALI_FUNC_ALWAYS;
382
383 default:
384 unreachable("Invalid func");
385 }
386 }
387
388 static unsigned
389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
390 {
391 switch (in) {
392 case PIPE_STENCIL_OP_KEEP:
393 return MALI_STENCIL_KEEP;
394
395 case PIPE_STENCIL_OP_ZERO:
396 return MALI_STENCIL_ZERO;
397
398 case PIPE_STENCIL_OP_REPLACE:
399 return MALI_STENCIL_REPLACE;
400
401 case PIPE_STENCIL_OP_INCR:
402 return MALI_STENCIL_INCR;
403
404 case PIPE_STENCIL_OP_DECR:
405 return MALI_STENCIL_DECR;
406
407 case PIPE_STENCIL_OP_INCR_WRAP:
408 return MALI_STENCIL_INCR_WRAP;
409
410 case PIPE_STENCIL_OP_DECR_WRAP:
411 return MALI_STENCIL_DECR_WRAP;
412
413 case PIPE_STENCIL_OP_INVERT:
414 return MALI_STENCIL_INVERT;
415
416 default:
417 unreachable("Invalid stencil op");
418 }
419 }
420
421 static unsigned
422 translate_tex_wrap(enum pipe_tex_wrap w)
423 {
424 switch (w) {
425 case PIPE_TEX_WRAP_REPEAT:
426 return MALI_WRAP_REPEAT;
427
428 case PIPE_TEX_WRAP_CLAMP:
429 return MALI_WRAP_CLAMP;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
432 return MALI_WRAP_CLAMP_TO_EDGE;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435 return MALI_WRAP_CLAMP_TO_BORDER;
436
437 case PIPE_TEX_WRAP_MIRROR_REPEAT:
438 return MALI_WRAP_MIRRORED_REPEAT;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP:
441 return MALI_WRAP_MIRRORED_CLAMP;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
444 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
448
449 default:
450 unreachable("Invalid wrap");
451 }
452 }
453
454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
455 struct mali_sampler_descriptor *hw)
456 {
457 unsigned func = panfrost_translate_compare_func(cso->compare_func);
458 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
459 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
460 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
461 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
462 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
463 unsigned mip_filter = mip_linear ?
464 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
465 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
466
467 *hw = (struct mali_sampler_descriptor) {
468 .filter_mode = min_filter | mag_filter | mip_filter |
469 normalized,
470 .wrap_s = translate_tex_wrap(cso->wrap_s),
471 .wrap_t = translate_tex_wrap(cso->wrap_t),
472 .wrap_r = translate_tex_wrap(cso->wrap_r),
473 .compare_func = panfrost_flip_compare_func(func),
474 .border_color = {
475 cso->border_color.f[0],
476 cso->border_color.f[1],
477 cso->border_color.f[2],
478 cso->border_color.f[3]
479 },
480 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
481 .max_lod = FIXED_16(cso->max_lod, false),
482 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
483 .seamless_cube_map = cso->seamless_cube_map,
484 };
485
486 /* If necessary, we disable mipmapping in the sampler descriptor by
487 * clamping the LOD as tight as possible (from 0 to epsilon,
488 * essentially -- remember these are fixed point numbers, so
489 * epsilon=1/256) */
490
491 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
492 hw->max_lod = hw->min_lod + 1;
493 }
494
495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
496 struct bifrost_sampler_descriptor *hw)
497 {
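/* Note the opposite polarity of the filter bits below: min_filter is set
 * when the minification filter is nearest, while mag_filter is set when the
 * magnification filter is linear (describing the code as written; the exact
 * hardware meaning is assumed). */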
498 *hw = (struct bifrost_sampler_descriptor) {
499 .unk1 = 0x1,
500 .wrap_s = translate_tex_wrap(cso->wrap_s),
501 .wrap_t = translate_tex_wrap(cso->wrap_t),
502 .wrap_r = translate_tex_wrap(cso->wrap_r),
503 .unk8 = 0x8,
504 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
505 .norm_coords = cso->normalized_coords,
506 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
507 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
508 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
509 .max_lod = FIXED_16(cso->max_lod, false),
510 };
511
512 /* If necessary, we disable mipmapping in the sampler descriptor by
513 * clamping the LOD as tight as possible (from 0 to epsilon,
514 * essentially -- remember these are fixed point numbers, so
515 * epsilon=1/256) */
516
517 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
518 hw->max_lod = hw->min_lod + 1;
519 }
520
521 static void
522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
523 struct mali_stencil_test *out)
524 {
525 out->ref = 0; /* Gallium gets it from elsewhere */
526
527 out->mask = in->valuemask;
528 out->func = panfrost_translate_compare_func(in->func);
529 out->sfail = panfrost_translate_stencil_op(in->fail_op);
530 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
531 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
532 }
533
534 static void
535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
536 struct mali_shader_meta *fragmeta)
537 {
538 if (!ctx->rasterizer) {
539 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
540 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
541 fragmeta->depth_units = 0.0f;
542 fragmeta->depth_factor = 0.0f;
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
544 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
545 return;
546 }
547
548 bool msaa = ctx->rasterizer->base.multisample;
549
550 /* TODO: Sample size */
551 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
553 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
554 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
555
556 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
557
558 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
559 ctx->rasterizer->base.offset_tri);
560 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
561 ctx->rasterizer->base.offset_tri);
562 }
563
564 static void
565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
566 struct mali_shader_meta *fragmeta)
567 {
568 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
569 int zfunc = PIPE_FUNC_ALWAYS;
570
571 if (!zsa) {
572 struct pipe_stencil_state default_stencil = {
573 .enabled = 0,
574 .func = PIPE_FUNC_ALWAYS,
575 .fail_op = MALI_STENCIL_KEEP,
576 .zfail_op = MALI_STENCIL_KEEP,
577 .zpass_op = MALI_STENCIL_KEEP,
578 .writemask = 0xFF,
579 .valuemask = 0xFF
580 };
581
582 panfrost_make_stencil_state(&default_stencil,
583 &fragmeta->stencil_front);
584 fragmeta->stencil_mask_front = default_stencil.writemask;
585 fragmeta->stencil_back = fragmeta->stencil_front;
586 fragmeta->stencil_mask_back = default_stencil.writemask;
587 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
588 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
589 } else {
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
591 zsa->stencil[0].enabled);
592 panfrost_make_stencil_state(&zsa->stencil[0],
593 &fragmeta->stencil_front);
594 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
595 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
596
597 /* If back-stencil is not enabled, use the front values */
598
599 if (zsa->stencil[1].enabled) {
600 panfrost_make_stencil_state(&zsa->stencil[1],
601 &fragmeta->stencil_back);
602 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
603 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
604 } else {
605 fragmeta->stencil_back = fragmeta->stencil_front;
606 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
607 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
608 }
609
610 if (zsa->depth.enabled)
611 zfunc = zsa->depth.func;
612
613 /* Depth state (TODO: Refactor) */
614
615 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
616 zsa->depth.writemask);
617 }
618
619 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
620 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
621 }
622
623 static bool
624 panfrost_fs_required(
625 struct panfrost_shader_state *fs,
626 struct panfrost_blend_final *blend,
627 unsigned rt_count)
628 {
629 /* If we generally have side effects */
630 if (fs->fs_sidefx)
631 return true;
632
633 /* If colour is written we need to execute */
634 for (unsigned i = 0; i < rt_count; ++i) {
635 if (!blend[i].no_colour)
636 return true;
637 }
638
639 /* If depth is written and not implied we need to execute.
640 * TODO: Predicate on Z/S writes being enabled */
641 return (fs->writes_depth || fs->writes_stencil);
642 }
643
644 static void
645 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
646 struct mali_shader_meta *fragmeta,
647 void *rts)
648 {
649 const struct panfrost_device *dev = pan_device(ctx->base.screen);
650 struct panfrost_shader_state *fs;
651 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
652
653 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
654 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
655 !ctx->blend->base.dither);
656
657 /* Get blending setup */
658 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
659
660 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
661 unsigned shader_offset = 0;
662 struct panfrost_bo *shader_bo = NULL;
663
664 for (unsigned c = 0; c < rt_count; ++c)
665 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
666 &shader_offset);
667
668 /* Disable shader execution if we can */
669 if (dev->quirks & MIDGARD_SHADERLESS
670 && !panfrost_fs_required(fs, blend, rt_count)) {
671 fragmeta->shader = 0;
672 fragmeta->attribute_count = 0;
673 fragmeta->varying_count = 0;
674 fragmeta->texture_count = 0;
675 fragmeta->sampler_count = 0;
676
677 /* This feature is not known to work on Bifrost */
678 fragmeta->midgard1.work_count = 1;
679 fragmeta->midgard1.uniform_count = 0;
680 fragmeta->midgard1.uniform_buffer_count = 0;
681 }
682
683 /* If there is a blend shader, work registers are shared. We impose 8
684 * work registers as a limit for blend shaders. Should be lower XXX */
685
686 if (!(dev->quirks & IS_BIFROST)) {
687 for (unsigned c = 0; c < rt_count; ++c) {
688 if (blend[c].is_shader) {
689 fragmeta->midgard1.work_count =
690 MAX2(fragmeta->midgard1.work_count, 8);
691 }
692 }
693 }
694
695 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
696 * copied to the blend_meta appended (by convention), but this is the
697 * field actually read by the hardware. (Or maybe both are read...?).
698 * Specify the last RTi with a blend shader. */
699
700 fragmeta->blend.shader = 0;
701
702 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
703 if (!blend[rt].is_shader)
704 continue;
705
706 fragmeta->blend.shader = blend[rt].shader.gpu |
707 blend[rt].shader.first_tag;
708 break;
709 }
710
711 if (dev->quirks & MIDGARD_SFBD) {
712 /* On platforms with only a single render target (SFBD), the blend
713 * information is inside the shader meta itself. We additionally
714 * need to signal CAN_DISCARD for nontrivial blend modes (so
715 * we're able to read back the destination buffer) */
716
717 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
718 blend[0].is_shader);
719
720 if (!blend[0].is_shader) {
721 fragmeta->blend.equation = *blend[0].equation.equation;
722 fragmeta->blend.constant = blend[0].equation.constant;
723 }
724
725 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
726 !blend[0].no_blending || fs->can_discard);
727 return;
728 }
729
730 /* Additional blend descriptor tacked on for jobs using MFBD */
731
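/* The per-RT flags below are computed once and then written into either the
 * Bifrost or the Midgard MRT descriptor, so both paths stay in sync -- this is
 * the MRT blend flag calculation shared with Bifrost referenced by the commit. */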
732 for (unsigned i = 0; i < rt_count; ++i) {
733 unsigned flags = 0;
734
735 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
736 flags = 0x200;
737
738 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
739 (ctx->pipe_framebuffer.cbufs[i]) &&
740 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
741
742 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
743 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
744 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
745 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
746 }
747
748 if (dev->quirks & IS_BIFROST) {
749 struct bifrost_blend_rt *brts = rts;
750
751 brts[i].flags = flags;
752
753 if (blend[i].is_shader) {
754 /* The blend shader's address needs to be at
755  * the same top 32 bits as the fragment shader.
756 * TODO: Ensure that's always the case.
757 */
758 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
759 (fs->bo->gpu & (0xffffffffull << 32)));
760 brts[i].shader = blend[i].shader.gpu;
761 brts[i].unk2 = 0x0;
762 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
763 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
764 const struct util_format_description *format_desc;
765 format_desc = util_format_description(format);
766
767 brts[i].equation = *blend[i].equation.equation;
768
769 /* TODO: this is a bit more complicated */
770 brts[i].constant = blend[i].equation.constant;
771
772 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
773 brts[i].unk2 = 0x19;
774
775 brts[i].shader_type = fs->blend_types[i];
776 } else {
777 /* Dummy attachment for depth-only */
778 brts[i].unk2 = 0x3;
779 brts[i].shader_type = fs->blend_types[i];
780 }
781 } else {
782 struct midgard_blend_rt *mrts = rts;
783 mrts[i].flags = flags;
784
785 if (blend[i].is_shader) {
786 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
787 } else {
788 mrts[i].blend.equation = *blend[i].equation.equation;
789 mrts[i].blend.constant = blend[i].equation.constant;
790 }
791 }
792 }
793 }
794
795 static void
796 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
797 struct mali_shader_meta *fragmeta,
798 void *rts)
799 {
800 const struct panfrost_device *dev = pan_device(ctx->base.screen);
801 struct panfrost_shader_state *fs;
802
803 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
804
805 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
806 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
807 fragmeta->unknown2_4 = 0x4e0;
808
809 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
810 * is required (independent of 32-bit/64-bit descriptors), or why it's
811 * not used on later GPU revisions. Otherwise, all shader jobs fault on
812 * these earlier chips (perhaps this is a chicken bit of some kind).
813 * More investigation is needed. */
814
815 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
816
817 if (dev->quirks & IS_BIFROST) {
818 /* TODO */
819 } else {
820 /* Depending on whether it's legal to do so in the given shader, we try to
821 * enable early-z testing (or forward-pixel kill?) */
822
823 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
824 !fs->can_discard && !fs->writes_depth);
825
826 /* Add the writes Z/S flags if needed. */
827 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
828 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
829
830 /* Any time texturing is used, derivatives are implicitly calculated,
831 * so we need to enable helper invocations */
832
833 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
834 fs->helper_invocations);
835
836 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
837
838 bool depth_enabled = fs->writes_depth ||
839 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
840
841 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
842 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
843 }
844
845 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
846 panfrost_frag_meta_zsa_update(ctx, fragmeta);
847 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
848 }
849
850 void
851 panfrost_emit_shader_meta(struct panfrost_batch *batch,
852 enum pipe_shader_type st,
853 struct mali_vertex_tiler_postfix *postfix)
854 {
855 struct panfrost_context *ctx = batch->ctx;
856 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
857
858 if (!ss) {
859 postfix->shader = 0;
860 return;
861 }
862
863 struct mali_shader_meta meta;
864
865 panfrost_shader_meta_init(ctx, st, &meta);
866
867 /* Add the shader BO to the batch. */
868 panfrost_batch_add_bo(batch, ss->bo,
869 PAN_BO_ACCESS_PRIVATE |
870 PAN_BO_ACCESS_READ |
871 panfrost_bo_access_for_stage(st));
872
873 mali_ptr shader_ptr;
874
875 if (st == PIPE_SHADER_FRAGMENT) {
876 struct panfrost_device *dev = pan_device(ctx->base.screen);
877 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
878 size_t desc_size = sizeof(meta);
879 void *rts = NULL;
880 struct panfrost_transfer xfer;
881 unsigned rt_size;
882
883 if (dev->quirks & MIDGARD_SFBD)
884 rt_size = 0;
885 else if (dev->quirks & IS_BIFROST)
886 rt_size = sizeof(struct bifrost_blend_rt);
887 else
888 rt_size = sizeof(struct midgard_blend_rt);
889
890 desc_size += rt_size * rt_count;
891
892 if (rt_size)
893 rts = rzalloc_size(ctx, rt_size * rt_count);
894
895 panfrost_frag_shader_meta_init(ctx, &meta, rts);
896
897 xfer = panfrost_allocate_transient(batch, desc_size);
898
899 memcpy(xfer.cpu, &meta, sizeof(meta));
900 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
901
902 if (rt_size)
903 ralloc_free(rts);
904
905 shader_ptr = xfer.gpu;
906 } else {
907 shader_ptr = panfrost_upload_transient(batch, &meta,
908 sizeof(meta));
909 }
910
911 postfix->shader = shader_ptr;
912 }
913
914 static void
915 panfrost_mali_viewport_init(struct panfrost_context *ctx,
916 struct mali_viewport *mvp)
917 {
918 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
919
920 /* Clip bounds are encoded as floats. The viewport itself is encoded as
921 * (somewhat) asymmetric ints. */
922
923 const struct pipe_scissor_state *ss = &ctx->scissor;
924
925 memset(mvp, 0, sizeof(*mvp));
926
927 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
928 * each direction. Clipping to the viewport in theory should work, but
929 * in practice causes issues when we're not explicitly trying to
930 * scissor */
931
932 *mvp = (struct mali_viewport) {
933 .clip_minx = -INFINITY,
934 .clip_miny = -INFINITY,
935 .clip_maxx = INFINITY,
936 .clip_maxy = INFINITY,
937 };
938
939 /* Always scissor to the viewport by default. */
940 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
941 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
942
943 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
944 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
945
946 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
947 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
948
949 /* Apply the scissor test */
950
951 unsigned minx, miny, maxx, maxy;
952
953 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
954 minx = MAX2(ss->minx, vp_minx);
955 miny = MAX2(ss->miny, vp_miny);
956 maxx = MIN2(ss->maxx, vp_maxx);
957 maxy = MIN2(ss->maxy, vp_maxy);
958 } else {
959 minx = vp_minx;
960 miny = vp_miny;
961 maxx = vp_maxx;
962 maxy = vp_maxy;
963 }
964
965 /* Hardware needs the min/max to be strictly ordered, so flip if we
966 * need to. The viewport transformation in the vertex shader will
967 * handle the negatives if we don't */
968
969 if (miny > maxy) {
970 unsigned temp = miny;
971 miny = maxy;
972 maxy = temp;
973 }
974
975 if (minx > maxx) {
976 unsigned temp = minx;
977 minx = maxx;
978 maxx = temp;
979 }
980
981 if (minz > maxz) {
982 float temp = minz;
983 minz = maxz;
984 maxz = temp;
985 }
986
987 /* Clamp to the framebuffer size as a last check */
988
989 minx = MIN2(ctx->pipe_framebuffer.width, minx);
990 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
991
992 miny = MIN2(ctx->pipe_framebuffer.height, miny);
993 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
994
995 /* Upload */
996
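/* viewport1 holds the inclusive maximum; MALI_POSITIVE() encodes it as the
 * value minus one, which panfrost_emit_viewport() adds back when unioning the
 * batch scissor. */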
997 mvp->viewport0[0] = minx;
998 mvp->viewport1[0] = MALI_POSITIVE(maxx);
999
1000 mvp->viewport0[1] = miny;
1001 mvp->viewport1[1] = MALI_POSITIVE(maxy);
1002
1003 mvp->clip_minz = minz;
1004 mvp->clip_maxz = maxz;
1005 }
1006
1007 void
1008 panfrost_emit_viewport(struct panfrost_batch *batch,
1009 struct mali_vertex_tiler_postfix *tiler_postfix)
1010 {
1011 struct panfrost_context *ctx = batch->ctx;
1012 struct mali_viewport mvp;
1013
1014 panfrost_mali_viewport_init(batch->ctx, &mvp);
1015
1016 /* Update the job, unless we're doing wallpapering (whose lack of
1017 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1018 * just... be faster :) */
1019
1020 if (!ctx->wallpaper_batch)
1021 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1022 mvp.viewport0[1],
1023 mvp.viewport1[0] + 1,
1024 mvp.viewport1[1] + 1);
1025
1026 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1027 sizeof(mvp));
1028 }
1029
1030 static mali_ptr
1031 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1032 enum pipe_shader_type st,
1033 struct panfrost_constant_buffer *buf,
1034 unsigned index)
1035 {
1036 struct pipe_constant_buffer *cb = &buf->cb[index];
1037 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1038
1039 if (rsrc) {
1040 panfrost_batch_add_bo(batch, rsrc->bo,
1041 PAN_BO_ACCESS_SHARED |
1042 PAN_BO_ACCESS_READ |
1043 panfrost_bo_access_for_stage(st));
1044
1045 /* Alignment guaranteed by
1046 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1047 return rsrc->bo->gpu + cb->buffer_offset;
1048 } else if (cb->user_buffer) {
1049 return panfrost_upload_transient(batch,
1050 cb->user_buffer +
1051 cb->buffer_offset,
1052 cb->buffer_size);
1053 } else {
1054 unreachable("No constant buffer");
1055 }
1056 }
1057
1058 struct sysval_uniform {
1059 union {
1060 float f[4];
1061 int32_t i[4];
1062 uint32_t u[4];
1063 uint64_t du[2];
1064 };
1065 };
1066
1067 static void
1068 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1069 struct sysval_uniform *uniform)
1070 {
1071 struct panfrost_context *ctx = batch->ctx;
1072 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1073
1074 uniform->f[0] = vp->scale[0];
1075 uniform->f[1] = vp->scale[1];
1076 uniform->f[2] = vp->scale[2];
1077 }
1078
1079 static void
1080 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1081 struct sysval_uniform *uniform)
1082 {
1083 struct panfrost_context *ctx = batch->ctx;
1084 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1085
1086 uniform->f[0] = vp->translate[0];
1087 uniform->f[1] = vp->translate[1];
1088 uniform->f[2] = vp->translate[2];
1089 }
1090
1091 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1092 enum pipe_shader_type st,
1093 unsigned int sysvalid,
1094 struct sysval_uniform *uniform)
1095 {
1096 struct panfrost_context *ctx = batch->ctx;
1097 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1098 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1099 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1100 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1101
1102 assert(dim);
1103 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1104
1105 if (dim > 1)
1106 uniform->i[1] = u_minify(tex->texture->height0,
1107 tex->u.tex.first_level);
1108
1109 if (dim > 2)
1110 uniform->i[2] = u_minify(tex->texture->depth0,
1111 tex->u.tex.first_level);
1112
1113 if (is_array)
1114 uniform->i[dim] = tex->texture->array_size;
1115 }
1116
1117 static void
1118 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1119 enum pipe_shader_type st,
1120 unsigned ssbo_id,
1121 struct sysval_uniform *uniform)
1122 {
1123 struct panfrost_context *ctx = batch->ctx;
1124
1125 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1126 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1127
1128 /* Compute address */
1129 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1130
1131 panfrost_batch_add_bo(batch, bo,
1132 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1133 panfrost_bo_access_for_stage(st));
1134
1135 /* Upload address and size as sysval */
1136 uniform->du[0] = bo->gpu + sb.buffer_offset;
1137 uniform->u[2] = sb.buffer_size;
1138 }
1139
1140 static void
1141 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1142 enum pipe_shader_type st,
1143 unsigned samp_idx,
1144 struct sysval_uniform *uniform)
1145 {
1146 struct panfrost_context *ctx = batch->ctx;
1147 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1148
1149 uniform->f[0] = sampl->min_lod;
1150 uniform->f[1] = sampl->max_lod;
1151 uniform->f[2] = sampl->lod_bias;
1152
1153 /* Even without any errata, Midgard represents "no mipmapping" as
1154 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1155 * panfrost_create_sampler_state which also explains our choice of
1156 * epsilon value (again to keep behaviour consistent) */
1157
1158 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1159 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1160 }
1161
1162 static void
1163 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1164 struct sysval_uniform *uniform)
1165 {
1166 struct panfrost_context *ctx = batch->ctx;
1167
1168 uniform->u[0] = ctx->compute_grid->grid[0];
1169 uniform->u[1] = ctx->compute_grid->grid[1];
1170 uniform->u[2] = ctx->compute_grid->grid[2];
1171 }
1172
1173 static void
1174 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1175 struct panfrost_shader_state *ss,
1176 enum pipe_shader_type st)
1177 {
1178 struct sysval_uniform *uniforms = (void *)buf;
1179
1180 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1181 int sysval = ss->sysval[i];
1182
1183 switch (PAN_SYSVAL_TYPE(sysval)) {
1184 case PAN_SYSVAL_VIEWPORT_SCALE:
1185 panfrost_upload_viewport_scale_sysval(batch,
1186 &uniforms[i]);
1187 break;
1188 case PAN_SYSVAL_VIEWPORT_OFFSET:
1189 panfrost_upload_viewport_offset_sysval(batch,
1190 &uniforms[i]);
1191 break;
1192 case PAN_SYSVAL_TEXTURE_SIZE:
1193 panfrost_upload_txs_sysval(batch, st,
1194 PAN_SYSVAL_ID(sysval),
1195 &uniforms[i]);
1196 break;
1197 case PAN_SYSVAL_SSBO:
1198 panfrost_upload_ssbo_sysval(batch, st,
1199 PAN_SYSVAL_ID(sysval),
1200 &uniforms[i]);
1201 break;
1202 case PAN_SYSVAL_NUM_WORK_GROUPS:
1203 panfrost_upload_num_work_groups_sysval(batch,
1204 &uniforms[i]);
1205 break;
1206 case PAN_SYSVAL_SAMPLER:
1207 panfrost_upload_sampler_sysval(batch, st,
1208 PAN_SYSVAL_ID(sysval),
1209 &uniforms[i]);
1210 break;
1211 default:
1212 assert(0);
1213 }
1214 }
1215 }
1216
1217 static const void *
1218 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1219 unsigned index)
1220 {
1221 struct pipe_constant_buffer *cb = &buf->cb[index];
1222 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1223
1224 if (rsrc)
1225 return rsrc->bo->cpu;
1226 else if (cb->user_buffer)
1227 return cb->user_buffer;
1228 else
1229 unreachable("No constant buffer");
1230 }
1231
1232 void
1233 panfrost_emit_const_buf(struct panfrost_batch *batch,
1234 enum pipe_shader_type stage,
1235 struct mali_vertex_tiler_postfix *postfix)
1236 {
1237 struct panfrost_context *ctx = batch->ctx;
1238 struct panfrost_shader_variants *all = ctx->shader[stage];
1239
1240 if (!all)
1241 return;
1242
1243 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1244
1245 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1246
1247 /* Uniforms are implicitly UBO #0 */
1248 bool has_uniforms = buf->enabled_mask & (1 << 0);
1249
1250 /* Allocate room for the sysvals and the uniforms */
1251 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1252 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1253 size_t size = sys_size + uniform_size;
1254 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1255 size);
1256
1257 /* Upload sysvals requested by the shader */
1258 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1259
1260 /* Upload uniforms */
1261 if (has_uniforms && uniform_size) {
1262 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1263 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1264 }
1265
1266 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1267 * uploaded */
1268
1269 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1270 assert(ubo_count >= 1);
1271
1272 size_t sz = sizeof(uint64_t) * ubo_count;
1273 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1274 int uniform_count = ss->uniform_count;
1275
1276 /* Upload uniforms as a UBO */
1277 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1278
1279 /* The rest are honest-to-goodness UBOs */
1280
1281 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1282 size_t usz = buf->cb[ubo].buffer_size;
1283 bool enabled = buf->enabled_mask & (1 << ubo);
1284 bool empty = usz == 0;
1285
1286 if (!enabled || empty) {
1287 /* Stub out disabled UBOs to catch accesses */
1288 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1289 continue;
1290 }
1291
1292 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1293 buf, ubo);
1294
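/* UBO sizes are encoded in 16-byte fields; e.g. a 100-byte buffer rounds up
 * to 112 bytes = 7 fields. */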
1295 unsigned bytes_per_field = 16;
1296 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1297 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1298 }
1299
1300 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1301 postfix->uniforms = transfer.gpu;
1302 postfix->uniform_buffers = ubufs;
1303
1304 buf->dirty_mask = 0;
1305 }
1306
1307 void
1308 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1309 const struct pipe_grid_info *info,
1310 struct midgard_payload_vertex_tiler *vtp)
1311 {
1312 struct panfrost_context *ctx = batch->ctx;
1313 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1314 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1315 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1316 128));
1317 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1318 info->grid[2] * 4;
1319 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1320 shared_size,
1321 1);
1322
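/* single_size rounds the per-workgroup shared size up to a power of two (at
 * least 128 bytes); shared_size then covers every workgroup in the grid with
 * an extra 4x factor of slack, and the descriptor fields below express the
 * workgroup count and size in log2 form. */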
1323 struct mali_shared_memory shared = {
1324 .shared_memory = bo->gpu,
1325 .shared_workgroup_count =
1326 util_logbase2_ceil(info->grid[0]) +
1327 util_logbase2_ceil(info->grid[1]) +
1328 util_logbase2_ceil(info->grid[2]),
1329 .shared_unk1 = 0x2,
1330 .shared_shift = util_logbase2(single_size) - 1
1331 };
1332
1333 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1334 sizeof(shared));
1335 }
1336
1337 static mali_ptr
1338 panfrost_get_tex_desc(struct panfrost_batch *batch,
1339 enum pipe_shader_type st,
1340 struct panfrost_sampler_view *view)
1341 {
1342 if (!view)
1343 return (mali_ptr) 0;
1344
1345 struct pipe_sampler_view *pview = &view->base;
1346 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1347
1348 /* Add the BO to the job so it's retained until the job is done. */
1349
1350 panfrost_batch_add_bo(batch, rsrc->bo,
1351 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1352 panfrost_bo_access_for_stage(st));
1353
1354 panfrost_batch_add_bo(batch, view->midgard_bo,
1355 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1356 panfrost_bo_access_for_stage(st));
1357
1358 return view->midgard_bo->gpu;
1359 }
1360
1361 void
1362 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1363 enum pipe_shader_type stage,
1364 struct mali_vertex_tiler_postfix *postfix)
1365 {
1366 struct panfrost_context *ctx = batch->ctx;
1367 struct panfrost_device *device = pan_device(ctx->base.screen);
1368
1369 if (!ctx->sampler_view_count[stage])
1370 return;
1371
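/* Bifrost consumes a flat array of texture descriptors copied into transient
 * memory, whereas Midgard consumes an array of 64-bit GPU pointers
 * ("trampolines") to descriptors that already live in their own BOs. */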
1372 if (device->quirks & IS_BIFROST) {
1373 struct bifrost_texture_descriptor *descriptors;
1374
1375 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1376 ctx->sampler_view_count[stage]);
1377
1378 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1379 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1380 struct pipe_sampler_view *pview = &view->base;
1381 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1382
1383 /* Add the BOs to the job so they are retained until the job is done. */
1384
1385 panfrost_batch_add_bo(batch, rsrc->bo,
1386 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1387 panfrost_bo_access_for_stage(stage));
1388
1389 panfrost_batch_add_bo(batch, view->bifrost_bo,
1390 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1391 panfrost_bo_access_for_stage(stage));
1392
1393 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1394 }
1395
1396 postfix->textures = panfrost_upload_transient(batch,
1397 descriptors,
1398 sizeof(struct bifrost_texture_descriptor) *
1399 ctx->sampler_view_count[stage]);
1400
1401 free(descriptors);
1402 } else {
1403 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1404
1405 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1406 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1407 ctx->sampler_views[stage][i]);
1408
1409 postfix->textures = panfrost_upload_transient(batch,
1410 trampolines,
1411 sizeof(uint64_t) *
1412 ctx->sampler_view_count[stage]);
1413 }
1414 }
1415
1416 void
1417 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1418 enum pipe_shader_type stage,
1419 struct mali_vertex_tiler_postfix *postfix)
1420 {
1421 struct panfrost_context *ctx = batch->ctx;
1422 struct panfrost_device *device = pan_device(ctx->base.screen);
1423
1424 if (!ctx->sampler_count[stage])
1425 return;
1426
1427 if (device->quirks & IS_BIFROST) {
1428 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1429 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1430 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1431 transfer_size);
1432 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1433
1434 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1435 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1436
1437 postfix->sampler_descriptor = transfer.gpu;
1438 } else {
1439 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1440 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1441 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1442 transfer_size);
1443 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1444
1445 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1446 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1447
1448 postfix->sampler_descriptor = transfer.gpu;
1449 }
1450 }
1451
1452 void
1453 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1454 struct mali_vertex_tiler_postfix *vertex_postfix)
1455 {
1456 struct panfrost_context *ctx = batch->ctx;
1457
1458 if (!ctx->vertex)
1459 return;
1460
1461 struct panfrost_vertex_state *so = ctx->vertex;
1462
1463 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1464 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1465 sizeof(*so->hw) *
1466 PAN_MAX_ATTRIBUTE);
1467 }
1468
1469 void
1470 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1471 struct mali_vertex_tiler_postfix *vertex_postfix)
1472 {
1473 struct panfrost_context *ctx = batch->ctx;
1474 struct panfrost_vertex_state *so = ctx->vertex;
1475
1476 /* Staged mali_attr, and index into them. i =/= k, depending on the
1477 * vertex buffer mask and instancing. Twice as much room is allocated,
1478  * for a worst case of NPOT_DIVIDEs which take up an extra slot */
1479 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1480 unsigned k = 0;
1481
1482 for (unsigned i = 0; i < so->num_elements; ++i) {
1483 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1484 * means duplicating some vertex buffers (who cares? aside from
1485 * maybe some caching implications but I somehow doubt that
1486 * matters) */
1487
1488 struct pipe_vertex_element *elem = &so->pipe[i];
1489 unsigned vbi = elem->vertex_buffer_index;
1490
1491 /* The exception to 1:1 mapping is that we can have multiple
1492 * entries (NPOT divisors), so we fixup anyways */
1493
1494 so->hw[i].index = k;
1495
1496 if (!(ctx->vb_mask & (1 << vbi)))
1497 continue;
1498
1499 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1500 struct panfrost_resource *rsrc;
1501
1502 rsrc = pan_resource(buf->buffer.resource);
1503 if (!rsrc)
1504 continue;
1505
1506 /* Align to 64 bytes by masking off the lower bits. This
1507 * will be adjusted back when we fixup the src_offset in
1508 * mali_attr_meta */
1509
1510 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1511 mali_ptr addr = raw_addr & ~63;
1512 unsigned chopped_addr = raw_addr - addr;
1513
1514 /* Add a dependency of the batch on the vertex buffer */
1515 panfrost_batch_add_bo(batch, rsrc->bo,
1516 PAN_BO_ACCESS_SHARED |
1517 PAN_BO_ACCESS_READ |
1518 PAN_BO_ACCESS_VERTEX_TILER);
1519
1520 /* Set common fields */
1521 attrs[k].elements = addr;
1522 attrs[k].stride = buf->stride;
1523
1524 /* Since we advanced the base pointer, we shrink the buffer
1525 * size */
1526 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1527
1528 /* We need to add the extra size we masked off (for
1529 * correctness) so the data doesn't get clamped away */
1530 attrs[k].size += chopped_addr;
1531
1532 /* For non-instancing make sure we initialize */
1533 attrs[k].shift = attrs[k].extra_flags = 0;
1534
1535 /* Instancing uses a dramatically different code path than
1536 * linear, so dispatch for the actual emission now that the
1537 * common code is finished */
1538
1539 unsigned divisor = elem->instance_divisor;
1540
1541 if (divisor && ctx->instance_count == 1) {
1542 /* Silly corner case where there's a divisor(=1) but
1543 * there's no legitimate instancing. So we want *every*
1544 * attribute to be the same. So set stride to zero so
1545 * we don't go anywhere. */
1546
1547 attrs[k].size = attrs[k].stride + chopped_addr;
1548 attrs[k].stride = 0;
1549 attrs[k++].elements |= MALI_ATTR_LINEAR;
1550 } else if (ctx->instance_count <= 1) {
1551 /* Normal, non-instanced attributes */
1552 attrs[k++].elements |= MALI_ATTR_LINEAR;
1553 } else {
1554 unsigned instance_shift = vertex_postfix->instance_shift;
1555 unsigned instance_odd = vertex_postfix->instance_odd;
1556
1557 k += panfrost_vertex_instanced(ctx->padded_count,
1558 instance_shift,
1559 instance_odd,
1560 divisor, &attrs[k]);
1561 }
1562 }
1563
1564 /* Add special gl_VertexID/gl_InstanceID buffers */
1565
1566 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1567 so->hw[PAN_VERTEX_ID].index = k++;
1568 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1569 so->hw[PAN_INSTANCE_ID].index = k++;
1570
1571 /* Upload whatever we emitted and go */
1572
1573 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1574 k * sizeof(*attrs));
1575 }
1576
1577 static mali_ptr
1578 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1579 unsigned stride, unsigned count)
1580 {
1581 /* Fill out the descriptor */
1582 slot->stride = stride;
1583 slot->size = stride * count;
1584 slot->shift = slot->extra_flags = 0;
1585
1586 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1587 slot->size);
1588
1589 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1590
1591 return transfer.gpu;
1592 }
1593
1594 static void
1595 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1596 unsigned stride, unsigned offset, unsigned count,
1597 struct pipe_stream_output_target *target)
1598 {
1599 /* Fill out the descriptor */
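/* Gallium stream-output strides are counted in dwords; the descriptor wants
 * bytes, hence the * 4. */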
1600 slot->stride = stride * 4;
1601 slot->shift = slot->extra_flags = 0;
1602
1603 unsigned max_size = target->buffer_size;
1604 unsigned expected_size = slot->stride * count;
1605
1606 slot->size = MIN2(max_size, expected_size);
1607
1608 /* Grab the BO and bind it to the batch */
1609 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1610
1611 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1612 * the perspective of the TILER and FRAGMENT.
1613 */
1614 panfrost_batch_add_bo(batch, bo,
1615 PAN_BO_ACCESS_SHARED |
1616 PAN_BO_ACCESS_RW |
1617 PAN_BO_ACCESS_VERTEX_TILER |
1618 PAN_BO_ACCESS_FRAGMENT);
1619
1620 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1621 slot->elements = addr;
1622 }
1623
1624 /* Given a shader and buffer indices, link varying metadata together */
1625
1626 static bool
1627 is_special_varying(gl_varying_slot loc)
1628 {
1629 switch (loc) {
1630 case VARYING_SLOT_POS:
1631 case VARYING_SLOT_PSIZ:
1632 case VARYING_SLOT_PNTC:
1633 case VARYING_SLOT_FACE:
1634 return true;
1635 default:
1636 return false;
1637 }
1638 }
1639
1640 static void
1641 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1642 signed general, signed gl_Position,
1643 signed gl_PointSize, signed gl_PointCoord,
1644 signed gl_FrontFacing)
1645 {
1646 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1647
1648 for (unsigned i = 0; i < ss->varying_count; ++i) {
1649 gl_varying_slot location = ss->varyings_loc[i];
1650 int index = -1;
1651
1652 switch (location) {
1653 case VARYING_SLOT_POS:
1654 index = gl_Position;
1655 break;
1656 case VARYING_SLOT_PSIZ:
1657 index = gl_PointSize;
1658 break;
1659 case VARYING_SLOT_PNTC:
1660 index = gl_PointCoord;
1661 break;
1662 case VARYING_SLOT_FACE:
1663 index = gl_FrontFacing;
1664 break;
1665 default:
1666 index = general;
1667 break;
1668 }
1669
1670 assert(index >= 0);
1671 out[i].index = index;
1672 }
1673 }
1674
1675 static bool
1676 has_point_coord(unsigned mask, gl_varying_slot loc)
1677 {
1678 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1679 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1680 else if (loc == VARYING_SLOT_PNTC)
1681 return (mask & (1 << 8));
1682 else
1683 return false;
1684 }
1685
1686 /* Helpers for manipulating stream out information so we can pack varyings
1687 * accordingly. Compute the src_offset for a given captured varying */
1688
1689 static struct pipe_stream_output *
1690 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1691 {
1692 for (unsigned i = 0; i < info->num_outputs; ++i) {
1693 if (info->output[i].register_index == loc)
1694 return &info->output[i];
1695 }
1696
1697 unreachable("Varying not captured");
1698 }
1699
1700 void
1701 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1702 unsigned vertex_count,
1703 struct mali_vertex_tiler_postfix *vertex_postfix,
1704 struct mali_vertex_tiler_postfix *tiler_postfix,
1705 union midgard_primitive_size *primitive_size)
1706 {
1707 /* Load the shaders */
1708 struct panfrost_context *ctx = batch->ctx;
1709 struct panfrost_shader_state *vs, *fs;
1710 unsigned int num_gen_varyings = 0;
1711 size_t vs_size, fs_size;
1712
1713 /* Allocate the varying descriptor */
1714
1715 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1716 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1717 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1718 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1719
1720 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1721 vs_size +
1722 fs_size);
1723
1724 struct pipe_stream_output_info *so = &vs->stream_output;
1725
1726 /* Check if this varying is linked by us. This is the case for
1727 * general-purpose, non-captured varyings. If it is, link it. If it's
1728 * not, use the provided stream out information to determine the
1729 * offset, since it was already linked for us. */
1730
1731 for (unsigned i = 0; i < vs->varying_count; i++) {
1732 gl_varying_slot loc = vs->varyings_loc[i];
1733
1734 bool special = is_special_varying(loc);
1735 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1736
1737 if (captured) {
1738 struct pipe_stream_output *o = pan_get_so(so, loc);
1739
1740 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1741 vs->varyings[i].src_offset = dst_offset;
1742 } else if (!special) {
1743 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1744 }
1745 }
1746
1747 /* Conversely, we need to set src_offset for the captured varyings.
1748 * Here, the layout is defined by the stream out info, not us */
1749
1750 /* Link up with fragment varyings */
1751 bool reads_point_coord = fs->reads_point_coord;
1752
1753 for (unsigned i = 0; i < fs->varying_count; i++) {
1754 gl_varying_slot loc = fs->varyings_loc[i];
1755 unsigned src_offset;
1756 signed vs_idx = -1;
1757
1758 /* Link up */
1759 for (unsigned j = 0; j < vs->varying_count; ++j) {
1760 if (vs->varyings_loc[j] == loc) {
1761 vs_idx = j;
1762 break;
1763 }
1764 }
1765
1766 /* Either assign or reuse */
1767 if (vs_idx >= 0)
1768 src_offset = vs->varyings[vs_idx].src_offset;
1769 else
1770 src_offset = 16 * (num_gen_varyings++);
1771
1772 fs->varyings[i].src_offset = src_offset;
1773
1774 if (has_point_coord(fs->point_sprite_mask, loc))
1775 reads_point_coord = true;
1776 }
1777
1778 memcpy(trans.cpu, vs->varyings, vs_size);
1779 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1780
1781 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1782
1783 /* Figure out how many streamout buffers could be bound */
1784 unsigned so_count = ctx->streamout.num_targets;
1785 for (unsigned i = 0; i < vs->varying_count; i++) {
1786 gl_varying_slot loc = vs->varyings_loc[i];
1787
1788 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1789 if (!captured) continue;
1790
1791 struct pipe_stream_output *o = pan_get_so(so, loc);
1792 so_count = MAX2(so_count, o->output_buffer + 1);
1793 }
1794
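        /* Lay out the varying buffer indices: stream out buffers first, then
         * the general-purpose varying buffer, then one record for each
         * special varying actually in use (-1 marks an unused special) */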
1795 signed idx = so_count;
1796 signed general = idx++;
1797 signed gl_Position = idx++;
1798 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1799 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1800 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1801 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1802
1803 /* Emit the stream out buffers */
1804
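        /* Number of vertices streamout will actually write, which depends on
         * the primitive type being drawn */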
1805 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1806 ctx->vertex_count);
1807
1808 for (unsigned i = 0; i < so_count; ++i) {
1809 if (i < ctx->streamout.num_targets) {
1810 panfrost_emit_streamout(batch, &varyings[i],
1811 so->stride[i],
1812 ctx->streamout.offsets[i],
1813 out_count,
1814 ctx->streamout.targets[i]);
1815 } else {
1816 /* Emit a dummy buffer */
1817 panfrost_emit_varyings(batch, &varyings[i],
1818 so->stride[i] * 4,
1819 out_count);
1820
1821 /* Clear the attribute type */
1822 varyings[i].elements &= ~0xF;
1823 }
1824 }
1825
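        /* The general-purpose varying buffer packs one 16-byte (vec4) slot
         * per linked varying per vertex, matching the src_offsets assigned
         * above */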
1826 panfrost_emit_varyings(batch, &varyings[general],
1827 num_gen_varyings * 16,
1828 vertex_count);
1829
1830 mali_ptr varyings_p;
1831
1832 /* fp32 vec4 gl_Position */
1833 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1834 sizeof(float) * 4, vertex_count);
1835 tiler_postfix->position_varying = varyings_p;
1836
1837
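        /* fp16 gl_PointSize, 2 bytes per vertex, fed to the tiler as the
         * primitive size pointer rather than as an ordinary varying */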
1838 if (panfrost_writes_point_size(ctx)) {
1839 varyings_p = panfrost_emit_varyings(batch,
1840 &varyings[gl_PointSize],
1841 2, vertex_count);
1842 primitive_size->pointer = varyings_p;
1843 }
1844
1845 if (reads_point_coord)
1846 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1847
1848 if (fs->reads_face)
1849 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1850
1851 if (fs->reads_frag_coord)
1852 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1853
1854 struct panfrost_device *device = pan_device(ctx->base.screen);
1855 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
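        /* Bifrost is not expected to take the special gl_PointCoord path
         * above; the assert guards that assumption */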
1856
1857 /* Let's go ahead and link varying meta to the buffer in question, now
1858 * that that information is available. VARYING_SLOT_POS is mapped to
1859  * gl_FragCoord for fragment shaders but gl_Position for vertex
1860  * shaders. */
1861
1862 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1863 gl_PointSize, gl_PointCoord,
1864 gl_FrontFacing);
1865
1866 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1867 gl_FragCoord, gl_PointSize,
1868 gl_PointCoord, gl_FrontFacing);
1869
1870 /* Replace streamout */
1871
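        /* Captured varyings read from the transform feedback buffer they
         * were assigned to, with the format trimmed to the number of
         * captured components */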
1872 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1873 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1874
1875 for (unsigned i = 0; i < vs->varying_count; i++) {
1876 gl_varying_slot loc = vs->varyings_loc[i];
1877
1878 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1879 if (!captured)
1880 continue;
1881
1882 struct pipe_stream_output *o = pan_get_so(so, loc);
1883 ovs[i].index = o->output_buffer;
1884
1885 assert(o->stream == 0);
1886 ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
1887 | MALI_NR_CHANNELS(o->num_components);
1888
1889 if (device->quirks & HAS_SWIZZLES)
1890 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1891 else
1892 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1893
1894 /* Link to the fragment */
1895 signed fs_idx = -1;
1896
1897 /* Link up */
1898 for (unsigned j = 0; j < fs->varying_count; ++j) {
1899 if (fs->varyings_loc[j] == loc) {
1900 fs_idx = j;
1901 break;
1902 }
1903 }
1904
1905 if (fs_idx >= 0) {
1906 ofs[fs_idx].index = ovs[i].index;
1907 ofs[fs_idx].format = ovs[i].format;
1908 ofs[fs_idx].swizzle = ovs[i].swizzle;
1909 }
1910 }
1911
1912 /* Replace point sprite */
1913 for (unsigned i = 0; i < fs->varying_count; i++) {
1914                 /* If we have a point sprite replacement, handle that here.
1915                  * We have to translate the location first. TODO: flip Y in
1916                  * the shader; we're already keying, it's just a time crunch */
1917
1918 if (has_point_coord(fs->point_sprite_mask,
1919 fs->varyings_loc[i])) {
1920 ofs[i].index = gl_PointCoord;
1921
1922 /* Swizzle out the z/w to 0/1 */
1923 ofs[i].format = MALI_RG16F;
1924 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1925 }
1926 }
1927
1928 /* Fix up unaligned addresses */
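        /* The record base is kept 64-byte aligned; any misalignment was left
         * in the low bits of .elements, so fold it into the record size and
         * into each consumer's src_offset instead */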
1929 for (unsigned i = 0; i < so_count; ++i) {
1930 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1931 continue;
1932
1933 unsigned align = (varyings[i].elements & 63);
1934
1935 /* While we're at it, the SO buffers are linear */
1936
1937 if (!align) {
1938 varyings[i].elements |= MALI_ATTR_LINEAR;
1939 continue;
1940 }
1941
1942 /* We need to adjust alignment */
1943 varyings[i].elements &= ~63;
1944 varyings[i].elements |= MALI_ATTR_LINEAR;
1945 varyings[i].size += align;
1946
1947 for (unsigned v = 0; v < vs->varying_count; ++v) {
1948 if (ovs[v].index != i)
1949 continue;
1950
1951 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1952 }
1953
1954 for (unsigned f = 0; f < fs->varying_count; ++f) {
1955 if (ofs[f].index != i)
1956 continue;
1957
1958 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1959 }
1960 }
1961
1962 varyings_p = panfrost_upload_transient(batch, varyings,
1963 idx * sizeof(*varyings));
1964 vertex_postfix->varyings = varyings_p;
1965 tiler_postfix->varyings = varyings_p;
1966
1967 vertex_postfix->varying_meta = trans.gpu;
1968 tiler_postfix->varying_meta = trans.gpu + vs_size;
1969 }
1970
1971 void
1972 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1973 struct mali_vertex_tiler_prefix *vertex_prefix,
1974 struct mali_vertex_tiler_postfix *vertex_postfix,
1975 struct mali_vertex_tiler_prefix *tiler_prefix,
1976 struct mali_vertex_tiler_postfix *tiler_postfix,
1977 union midgard_primitive_size *primitive_size)
1978 {
1979 struct panfrost_context *ctx = batch->ctx;
1980 struct panfrost_device *device = pan_device(ctx->base.screen);
1981 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1982 struct bifrost_payload_vertex bifrost_vertex = {0,};
1983 struct bifrost_payload_tiler bifrost_tiler = {0,};
1984 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1985 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1986 void *vp, *tp;
1987 size_t vp_size, tp_size;
1988
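        /* Bifrost and Midgard use different vertex/tiler payload layouts;
         * build whichever matches this device and note its size for job
         * submission */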
1989 if (device->quirks & IS_BIFROST) {
1990 bifrost_vertex.prefix = *vertex_prefix;
1991 bifrost_vertex.postfix = *vertex_postfix;
1992 vp = &bifrost_vertex;
1993 vp_size = sizeof(bifrost_vertex);
1994
1995 bifrost_tiler.prefix = *tiler_prefix;
1996 bifrost_tiler.tiler.primitive_size = *primitive_size;
1997 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1998 bifrost_tiler.postfix = *tiler_postfix;
1999 tp = &bifrost_tiler;
2000 tp_size = sizeof(bifrost_tiler);
2001 } else {
2002 midgard_vertex.prefix = *vertex_prefix;
2003 midgard_vertex.postfix = *vertex_postfix;
2004 vp = &midgard_vertex;
2005 vp_size = sizeof(midgard_vertex);
2006
2007 midgard_tiler.prefix = *tiler_prefix;
2008 midgard_tiler.postfix = *tiler_postfix;
2009 midgard_tiler.primitive_size = *primitive_size;
2010 tp = &midgard_tiler;
2011 tp_size = sizeof(midgard_tiler);
2012 }
2013
2014 if (wallpapering) {
2015 /* Inject in reverse order, with "predicted" job indices.
2016 * THIS IS A HACK XXX */
2017 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2018 batch->job_index + 2, tp, tp_size, true);
2019 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2020 vp, vp_size, true);
2021 return;
2022 }
2023
2024         /* If rasterizer discard is enabled, only submit the vertex job */
2025
2026 bool rasterizer_discard = ctx->rasterizer &&
2027 ctx->rasterizer->base.rasterizer_discard;
2028
2029 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2030 vp, vp_size, false);
2031
2032 if (rasterizer_discard)
2033 return;
2034
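        /* The tiler job consumes the vertex job's output, so pass the vertex
         * job index as its dependency */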
2035 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2036 false);
2037 }
2038
2039 /* TODO: stop hardcoding this */
2040 mali_ptr
2041 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2042 {
2043 uint16_t locations[] = {
2044 128, 128,
2045 0, 256,
2046 0, 256,
2047 0, 256,
2048 0, 256,
2049 0, 256,
2050 0, 256,
2051 0, 256,
2052 0, 256,
2053 0, 256,
2054 0, 256,
2055 0, 256,
2056 0, 256,
2057 0, 256,
2058 0, 256,
2059 0, 256,
2060 0, 256,
2061 0, 256,
2062 0, 256,
2063 0, 256,
2064 0, 256,
2065 0, 256,
2066 0, 256,
2067 0, 256,
2068 0, 256,
2069 0, 256,
2070 0, 256,
2071 0, 256,
2072 0, 256,
2073 0, 256,
2074 0, 256,
2075 0, 256,
2076 128, 128,
2077 0, 0,
2078 0, 0,
2079 0, 0,
2080 0, 0,
2081 0, 0,
2082 0, 0,
2083 0, 0,
2084 0, 0,
2085 0, 0,
2086 0, 0,
2087 0, 0,
2088 0, 0,
2089 0, 0,
2090 0, 0,
2091 0, 0,
2092 };
2093
2094 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2095 }