panfrost: Limit blend shader work count
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
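/* A note on usage (grounded in panfrost_vt_set_draw_info below): the computed
 * [min, max] range determines the vertex count (max - min + 1) and the
 * offset_bias_correction applied to the draw. */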
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
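/* Factor the padded count as (2k + 1) << shift, i.e. an odd number times a
 * power of two; e.g. 12 = 3 << 2 yields shift = 2, k = 1. */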
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349 meta->midgard1.flags_hi = 0x8; /* XXX */
350 meta->midgard1.flags_lo = 0x220;
351 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
352 }
353 }
354
355 static unsigned
356 panfrost_translate_compare_func(enum pipe_compare_func in)
357 {
358 switch (in) {
359 case PIPE_FUNC_NEVER:
360 return MALI_FUNC_NEVER;
361
362 case PIPE_FUNC_LESS:
363 return MALI_FUNC_LESS;
364
365 case PIPE_FUNC_EQUAL:
366 return MALI_FUNC_EQUAL;
367
368 case PIPE_FUNC_LEQUAL:
369 return MALI_FUNC_LEQUAL;
370
371 case PIPE_FUNC_GREATER:
372 return MALI_FUNC_GREATER;
373
374 case PIPE_FUNC_NOTEQUAL:
375 return MALI_FUNC_NOTEQUAL;
376
377 case PIPE_FUNC_GEQUAL:
378 return MALI_FUNC_GEQUAL;
379
380 case PIPE_FUNC_ALWAYS:
381 return MALI_FUNC_ALWAYS;
382
383 default:
384 unreachable("Invalid func");
385 }
386 }
387
388 static unsigned
389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
390 {
391 switch (in) {
392 case PIPE_STENCIL_OP_KEEP:
393 return MALI_STENCIL_KEEP;
394
395 case PIPE_STENCIL_OP_ZERO:
396 return MALI_STENCIL_ZERO;
397
398 case PIPE_STENCIL_OP_REPLACE:
399 return MALI_STENCIL_REPLACE;
400
401 case PIPE_STENCIL_OP_INCR:
402 return MALI_STENCIL_INCR;
403
404 case PIPE_STENCIL_OP_DECR:
405 return MALI_STENCIL_DECR;
406
407 case PIPE_STENCIL_OP_INCR_WRAP:
408 return MALI_STENCIL_INCR_WRAP;
409
410 case PIPE_STENCIL_OP_DECR_WRAP:
411 return MALI_STENCIL_DECR_WRAP;
412
413 case PIPE_STENCIL_OP_INVERT:
414 return MALI_STENCIL_INVERT;
415
416 default:
417 unreachable("Invalid stencil op");
418 }
419 }
420
421 static unsigned
422 translate_tex_wrap(enum pipe_tex_wrap w)
423 {
424 switch (w) {
425 case PIPE_TEX_WRAP_REPEAT:
426 return MALI_WRAP_REPEAT;
427
428 case PIPE_TEX_WRAP_CLAMP:
429 return MALI_WRAP_CLAMP;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
432 return MALI_WRAP_CLAMP_TO_EDGE;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435 return MALI_WRAP_CLAMP_TO_BORDER;
436
437 case PIPE_TEX_WRAP_MIRROR_REPEAT:
438 return MALI_WRAP_MIRRORED_REPEAT;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP:
441 return MALI_WRAP_MIRRORED_CLAMP;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
444 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
448
449 default:
450 unreachable("Invalid wrap");
451 }
452 }
453
454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
455 struct mali_sampler_descriptor *hw)
456 {
457 unsigned func = panfrost_translate_compare_func(cso->compare_func);
458 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
459 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
460 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
461 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
462 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
463 unsigned mip_filter = mip_linear ?
464 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
465 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
466
467 *hw = (struct mali_sampler_descriptor) {
468 .filter_mode = min_filter | mag_filter | mip_filter |
469 normalized,
470 .wrap_s = translate_tex_wrap(cso->wrap_s),
471 .wrap_t = translate_tex_wrap(cso->wrap_t),
472 .wrap_r = translate_tex_wrap(cso->wrap_r),
473 .compare_func = panfrost_flip_compare_func(func),
474 .border_color = {
475 cso->border_color.f[0],
476 cso->border_color.f[1],
477 cso->border_color.f[2],
478 cso->border_color.f[3]
479 },
480 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
481 .max_lod = FIXED_16(cso->max_lod, false),
482 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
483 .seamless_cube_map = cso->seamless_cube_map,
484 };
485
486 /* If necessary, we disable mipmapping in the sampler descriptor by
487 * clamping the LOD as tight as possible (from 0 to epsilon,
488 * essentially -- remember these are fixed point numbers, so
489 * epsilon=1/256) */
490
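/* One fixed-point step (the '+ 1' below) is the 1/256 epsilon described
 * above. */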
491 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
492 hw->max_lod = hw->min_lod + 1;
493 }
494
495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
496 struct bifrost_sampler_descriptor *hw)
497 {
498 *hw = (struct bifrost_sampler_descriptor) {
499 .unk1 = 0x1,
500 .wrap_s = translate_tex_wrap(cso->wrap_s),
501 .wrap_t = translate_tex_wrap(cso->wrap_t),
502 .wrap_r = translate_tex_wrap(cso->wrap_r),
503 .unk8 = 0x8,
504 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
505 .norm_coords = cso->normalized_coords,
506 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
507 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
508 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
509 .max_lod = FIXED_16(cso->max_lod, false),
510 };
511
512 /* If necessary, we disable mipmapping in the sampler descriptor by
513 * clamping the LOD as tight as possible (from 0 to epsilon,
514 * essentially -- remember these are fixed point numbers, so
515 * epsilon=1/256) */
516
517 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
518 hw->max_lod = hw->min_lod + 1;
519 }
520
521 static void
522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
523 struct mali_stencil_test *out)
524 {
525 out->ref = 0; /* Gallium gets it from elsewhere */
526
527 out->mask = in->valuemask;
528 out->func = panfrost_translate_compare_func(in->func);
529 out->sfail = panfrost_translate_stencil_op(in->fail_op);
530 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
531 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
532 }
533
534 static void
535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
536 struct mali_shader_meta *fragmeta)
537 {
538 if (!ctx->rasterizer) {
539 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
540 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
541 fragmeta->depth_units = 0.0f;
542 fragmeta->depth_factor = 0.0f;
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
544 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
545 return;
546 }
547
548 bool msaa = ctx->rasterizer->base.multisample;
549
550 /* TODO: Sample size */
551 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
553 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
554 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
555
556 /* XXX: Which bit is which? Does this maybe allow offsetting non-tris? */
557
558 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
559 ctx->rasterizer->base.offset_tri);
560 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
561 ctx->rasterizer->base.offset_tri);
562 }
563
564 static void
565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
566 struct mali_shader_meta *fragmeta)
567 {
568 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
569 int zfunc = PIPE_FUNC_ALWAYS;
570
571 if (!zsa) {
572 struct pipe_stencil_state default_stencil = {
573 .enabled = 0,
574 .func = PIPE_FUNC_ALWAYS,
575 .fail_op = MALI_STENCIL_KEEP,
576 .zfail_op = MALI_STENCIL_KEEP,
577 .zpass_op = MALI_STENCIL_KEEP,
578 .writemask = 0xFF,
579 .valuemask = 0xFF
580 };
581
582 panfrost_make_stencil_state(&default_stencil,
583 &fragmeta->stencil_front);
584 fragmeta->stencil_mask_front = default_stencil.writemask;
585 fragmeta->stencil_back = fragmeta->stencil_front;
586 fragmeta->stencil_mask_back = default_stencil.writemask;
587 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
588 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
589 } else {
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
591 zsa->stencil[0].enabled);
592 panfrost_make_stencil_state(&zsa->stencil[0],
593 &fragmeta->stencil_front);
594 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
595 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
596
597 /* If back-stencil is not enabled, use the front values */
598
599 if (zsa->stencil[1].enabled) {
600 panfrost_make_stencil_state(&zsa->stencil[1],
601 &fragmeta->stencil_back);
602 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
603 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
604 } else {
605 fragmeta->stencil_back = fragmeta->stencil_front;
606 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
607 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
608 }
609
610 if (zsa->depth.enabled)
611 zfunc = zsa->depth.func;
612
613 /* Depth state (TODO: Refactor) */
614
615 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
616 zsa->depth.writemask);
617 }
618
619 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
620 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
621 }
622
623 static void
624 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
625 struct mali_shader_meta *fragmeta,
626 void *rts)
627 {
628 const struct panfrost_device *dev = pan_device(ctx->base.screen);
629
630 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
631 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
632 !ctx->blend->base.dither);
633
634 /* Get blending setup */
635 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
636
637 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
638 unsigned shader_offset = 0;
639 struct panfrost_bo *shader_bo = NULL;
640
641 for (unsigned c = 0; c < rt_count; ++c)
642 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
643 &shader_offset);
644
645 /* If there is a blend shader, work registers are shared. We impose 8
646 * work registers as a limit for blend shaders. Should be lower XXX */
647
648 if (!(dev->quirks & IS_BIFROST)) {
649 for (unsigned c = 0; c < rt_count; ++c) {
650 if (blend[c].is_shader) {
651 fragmeta->midgard1.work_count =
652 MAX2(fragmeta->midgard1.work_count, 8);
653 }
654 }
655 }
656
657 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
658 * copied to the blend_meta appended (by convention), but this is the
659 * field actually read by the hardware. (Or maybe both are read...?).
660 * Specify the last RTi with a blend shader. */
661
662 fragmeta->blend.shader = 0;
663
664 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
665 if (!blend[rt].is_shader)
666 continue;
667
668 fragmeta->blend.shader = blend[rt].shader.gpu |
669 blend[rt].shader.first_tag;
670 break;
671 }
672
673 if (dev->quirks & MIDGARD_SFBD) {
674 /* On single render target (SFBD) platforms, the blend
675 * information is inside the shader meta itself. We additionally
676 * need to signal CAN_DISCARD for nontrivial blend modes (so
677 * we're able to read back the destination buffer) */
678
679 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
680 blend[0].is_shader);
681
682 if (!blend[0].is_shader) {
683 fragmeta->blend.equation = *blend[0].equation.equation;
684 fragmeta->blend.constant = blend[0].equation.constant;
685 }
686
687 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
688 !blend[0].no_blending);
689 return;
690 }
691
692 /* Additional blend descriptor tacked on for jobs using MFBD */
693
694 for (unsigned i = 0; i < rt_count; ++i) {
695 if (dev->quirks & IS_BIFROST) {
696 struct bifrost_blend_rt *brts = rts;
697 struct panfrost_shader_state *fs;
698 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
699
700 brts[i].flags = 0x200;
701 if (blend[i].is_shader) {
702 /* The blend shader's address needs to be at
703 * the same top 32 bit as the fragment shader.
704 * TODO: Ensure that's always the case.
705 */
706 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
707 (fs->bo->gpu & (0xffffffffull << 32)));
708 brts[i].shader = blend[i].shader.gpu;
709 brts[i].unk2 = 0x0;
710 } else {
711 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
712 const struct util_format_description *format_desc;
713 format_desc = util_format_description(format);
714
715 brts[i].equation = *blend[i].equation.equation;
716
717 /* TODO: this is a bit more complicated */
718 brts[i].constant = blend[i].equation.constant;
719
720 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
721 brts[i].unk2 = 0x19;
722
723 brts[i].shader_type = fs->blend_types[i];
724 }
725 } else {
726 struct midgard_blend_rt *mrts = rts;
727
728 mrts[i].flags = 0x200;
729
730 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
731 (ctx->pipe_framebuffer.cbufs[i]) &&
732 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
733
734 SET_BIT(mrts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
735 SET_BIT(mrts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
736 SET_BIT(mrts[i].flags, MALI_BLEND_SRGB, is_srgb);
737 SET_BIT(mrts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
738
739 if (blend[i].is_shader) {
740 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
741 } else {
742 mrts[i].blend.equation = *blend[i].equation.equation;
743 mrts[i].blend.constant = blend[i].equation.constant;
744 }
745 }
746 }
747 }
748
749 static void
750 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
751 struct mali_shader_meta *fragmeta,
752 void *rts)
753 {
754 const struct panfrost_device *dev = pan_device(ctx->base.screen);
755 struct panfrost_shader_state *fs;
756
757 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
758
759 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
760 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
761 fragmeta->unknown2_4 = 0x4e0;
762
763 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
764 * is required (independent of 32-bit/64-bit descriptors), or why it's
765 * not used on later GPU revisions. Otherwise, all shader jobs fault on
766 * these earlier chips (perhaps this is a chicken bit of some kind).
767 * More investigation is needed. */
768
769 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
770
771 if (dev->quirks & IS_BIFROST) {
772 /* TODO */
773 } else {
774 /* Depending on whether it's legal to do so in the given shader, we try to
775 * enable early-z testing (or forward-pixel kill?) */
776
777 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
778 !fs->can_discard && !fs->writes_depth);
779
780 /* Add the writes Z/S flags if needed. */
781 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
782 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
783
784 /* Any time texturing is used, derivatives are implicitly calculated,
785 * so we need to enable helper invocations */
786
787 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
788 fs->helper_invocations);
789
790 /* CAN_DISCARD should be set if the fragment shader possibly contains a
791 * 'discard' instruction. It is likely this is related to optimizations
792 * related to forward-pixel kill, as per "Mali Performance 3: Is
793 * EGL_BUFFER_PRESERVED a good thing?" by Peter Harris */
794
795 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
796
797 bool depth_enabled = fs->writes_depth ||
798 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
799
800 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD, fs->can_discard);
801 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
802 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
803 }
804
805 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
806 panfrost_frag_meta_zsa_update(ctx, fragmeta);
807 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
808 }
809
810 void
811 panfrost_emit_shader_meta(struct panfrost_batch *batch,
812 enum pipe_shader_type st,
813 struct mali_vertex_tiler_postfix *postfix)
814 {
815 struct panfrost_context *ctx = batch->ctx;
816 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
817
818 if (!ss) {
819 postfix->shader = 0;
820 return;
821 }
822
823 struct mali_shader_meta meta;
824
825 panfrost_shader_meta_init(ctx, st, &meta);
826
827 /* Add the shader BO to the batch. */
828 panfrost_batch_add_bo(batch, ss->bo,
829 PAN_BO_ACCESS_PRIVATE |
830 PAN_BO_ACCESS_READ |
831 panfrost_bo_access_for_stage(st));
832
833 mali_ptr shader_ptr;
834
835 if (st == PIPE_SHADER_FRAGMENT) {
836 struct panfrost_device *dev = pan_device(ctx->base.screen);
837 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
838 size_t desc_size = sizeof(meta);
839 void *rts = NULL;
840 struct panfrost_transfer xfer;
841 unsigned rt_size;
842
843 if (dev->quirks & MIDGARD_SFBD)
844 rt_size = 0;
845 else if (dev->quirks & IS_BIFROST)
846 rt_size = sizeof(struct bifrost_blend_rt);
847 else
848 rt_size = sizeof(struct midgard_blend_rt);
849
850 desc_size += rt_size * rt_count;
851
852 if (rt_size)
853 rts = rzalloc_size(ctx, rt_size * rt_count);
854
855 panfrost_frag_shader_meta_init(ctx, &meta, rts);
856
857 xfer = panfrost_allocate_transient(batch, desc_size);
858
859 memcpy(xfer.cpu, &meta, sizeof(meta));
860 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
861
862 if (rt_size)
863 ralloc_free(rts);
864
865 shader_ptr = xfer.gpu;
866 } else {
867 shader_ptr = panfrost_upload_transient(batch, &meta,
868 sizeof(meta));
869 }
870
871 postfix->shader = shader_ptr;
872 }
873
874 static void
875 panfrost_mali_viewport_init(struct panfrost_context *ctx,
876 struct mali_viewport *mvp)
877 {
878 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
879
880 /* Clip bounds are encoded as floats. The viewport itself is encoded as
881 * (somewhat) asymmetric ints. */
882
883 const struct pipe_scissor_state *ss = &ctx->scissor;
884
885 memset(mvp, 0, sizeof(*mvp));
886
887 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
888 * each direction. Clipping to the viewport in theory should work, but
889 * in practice causes issues when we're not explicitly trying to
890 * scissor */
891
892 *mvp = (struct mali_viewport) {
893 .clip_minx = -INFINITY,
894 .clip_miny = -INFINITY,
895 .clip_maxx = INFINITY,
896 .clip_maxy = INFINITY,
897 };
898
899 /* Always scissor to the viewport by default. */
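/* Gallium viewports transform NDC via x * scale + translate, so the
 * screen-space extent is translate +/- |scale| on each axis. */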
900 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
901 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
902
903 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
904 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
905
906 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
907 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
908
909 /* Apply the scissor test */
910
911 unsigned minx, miny, maxx, maxy;
912
913 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
914 minx = MAX2(ss->minx, vp_minx);
915 miny = MAX2(ss->miny, vp_miny);
916 maxx = MIN2(ss->maxx, vp_maxx);
917 maxy = MIN2(ss->maxy, vp_maxy);
918 } else {
919 minx = vp_minx;
920 miny = vp_miny;
921 maxx = vp_maxx;
922 maxy = vp_maxy;
923 }
924
925 /* Hardware needs the min/max to be strictly ordered, so flip if we
926 * need to. The viewport transformation in the vertex shader will
927 * handle the negatives if we don't */
928
929 if (miny > maxy) {
930 unsigned temp = miny;
931 miny = maxy;
932 maxy = temp;
933 }
934
935 if (minx > maxx) {
936 unsigned temp = minx;
937 minx = maxx;
938 maxx = temp;
939 }
940
941 if (minz > maxz) {
942 float temp = minz;
943 minz = maxz;
944 maxz = temp;
945 }
946
947 /* Clamp to the framebuffer size as a last check */
948
949 minx = MIN2(ctx->pipe_framebuffer.width, minx);
950 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
951
952 miny = MIN2(ctx->pipe_framebuffer.height, miny);
953 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
954
955 /* Upload */
956
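/* viewport1 holds the inclusive maximum, encoded off-by-one via
 * MALI_POSITIVE; panfrost_emit_viewport adds the 1 back when growing the
 * batch scissor. */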
957 mvp->viewport0[0] = minx;
958 mvp->viewport1[0] = MALI_POSITIVE(maxx);
959
960 mvp->viewport0[1] = miny;
961 mvp->viewport1[1] = MALI_POSITIVE(maxy);
962
963 mvp->clip_minz = minz;
964 mvp->clip_maxz = maxz;
965 }
966
967 void
968 panfrost_emit_viewport(struct panfrost_batch *batch,
969 struct mali_vertex_tiler_postfix *tiler_postfix)
970 {
971 struct panfrost_context *ctx = batch->ctx;
972 struct mali_viewport mvp;
973
974 panfrost_mali_viewport_init(batch->ctx, &mvp);
975
976 /* Update the job, unless we're doing wallpapering (whose lack of
977 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
978 * just... be faster :) */
979
980 if (!ctx->wallpaper_batch)
981 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
982 mvp.viewport0[1],
983 mvp.viewport1[0] + 1,
984 mvp.viewport1[1] + 1);
985
986 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
987 sizeof(mvp));
988 }
989
990 static mali_ptr
991 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
992 enum pipe_shader_type st,
993 struct panfrost_constant_buffer *buf,
994 unsigned index)
995 {
996 struct pipe_constant_buffer *cb = &buf->cb[index];
997 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
998
999 if (rsrc) {
1000 panfrost_batch_add_bo(batch, rsrc->bo,
1001 PAN_BO_ACCESS_SHARED |
1002 PAN_BO_ACCESS_READ |
1003 panfrost_bo_access_for_stage(st));
1004
1005 /* Alignment guaranteed by
1006 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1007 return rsrc->bo->gpu + cb->buffer_offset;
1008 } else if (cb->user_buffer) {
1009 return panfrost_upload_transient(batch,
1010 cb->user_buffer +
1011 cb->buffer_offset,
1012 cb->buffer_size);
1013 } else {
1014 unreachable("No constant buffer");
1015 }
1016 }
1017
1018 struct sysval_uniform {
1019 union {
1020 float f[4];
1021 int32_t i[4];
1022 uint32_t u[4];
1023 uint64_t du[2];
1024 };
1025 };
1026
1027 static void
1028 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1029 struct sysval_uniform *uniform)
1030 {
1031 struct panfrost_context *ctx = batch->ctx;
1032 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1033
1034 uniform->f[0] = vp->scale[0];
1035 uniform->f[1] = vp->scale[1];
1036 uniform->f[2] = vp->scale[2];
1037 }
1038
1039 static void
1040 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1041 struct sysval_uniform *uniform)
1042 {
1043 struct panfrost_context *ctx = batch->ctx;
1044 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1045
1046 uniform->f[0] = vp->translate[0];
1047 uniform->f[1] = vp->translate[1];
1048 uniform->f[2] = vp->translate[2];
1049 }
1050
1051 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1052 enum pipe_shader_type st,
1053 unsigned int sysvalid,
1054 struct sysval_uniform *uniform)
1055 {
1056 struct panfrost_context *ctx = batch->ctx;
1057 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1058 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1059 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1060 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1061
1062 assert(dim);
1063 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1064
1065 if (dim > 1)
1066 uniform->i[1] = u_minify(tex->texture->height0,
1067 tex->u.tex.first_level);
1068
1069 if (dim > 2)
1070 uniform->i[2] = u_minify(tex->texture->depth0,
1071 tex->u.tex.first_level);
1072
1073 if (is_array)
1074 uniform->i[dim] = tex->texture->array_size;
1075 }
1076
1077 static void
1078 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1079 enum pipe_shader_type st,
1080 unsigned ssbo_id,
1081 struct sysval_uniform *uniform)
1082 {
1083 struct panfrost_context *ctx = batch->ctx;
1084
1085 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1086 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1087
1088 /* Compute address */
1089 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1090
1091 panfrost_batch_add_bo(batch, bo,
1092 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1093 panfrost_bo_access_for_stage(st));
1094
1095 /* Upload address and size as sysval */
1096 uniform->du[0] = bo->gpu + sb.buffer_offset;
1097 uniform->u[2] = sb.buffer_size;
1098 }
1099
1100 static void
1101 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1102 enum pipe_shader_type st,
1103 unsigned samp_idx,
1104 struct sysval_uniform *uniform)
1105 {
1106 struct panfrost_context *ctx = batch->ctx;
1107 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1108
1109 uniform->f[0] = sampl->min_lod;
1110 uniform->f[1] = sampl->max_lod;
1111 uniform->f[2] = sampl->lod_bias;
1112
1113 /* Even without any errata, Midgard represents "no mipmapping" as
1114 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1115 * panfrost_create_sampler_state which also explains our choice of
1116 * epsilon value (again to keep behaviour consistent) */
1117
1118 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1119 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1120 }
1121
1122 static void
1123 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1124 struct sysval_uniform *uniform)
1125 {
1126 struct panfrost_context *ctx = batch->ctx;
1127
1128 uniform->u[0] = ctx->compute_grid->grid[0];
1129 uniform->u[1] = ctx->compute_grid->grid[1];
1130 uniform->u[2] = ctx->compute_grid->grid[2];
1131 }
1132
1133 static void
1134 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1135 struct panfrost_shader_state *ss,
1136 enum pipe_shader_type st)
1137 {
1138 struct sysval_uniform *uniforms = (void *)buf;
1139
1140 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1141 int sysval = ss->sysval[i];
1142
1143 switch (PAN_SYSVAL_TYPE(sysval)) {
1144 case PAN_SYSVAL_VIEWPORT_SCALE:
1145 panfrost_upload_viewport_scale_sysval(batch,
1146 &uniforms[i]);
1147 break;
1148 case PAN_SYSVAL_VIEWPORT_OFFSET:
1149 panfrost_upload_viewport_offset_sysval(batch,
1150 &uniforms[i]);
1151 break;
1152 case PAN_SYSVAL_TEXTURE_SIZE:
1153 panfrost_upload_txs_sysval(batch, st,
1154 PAN_SYSVAL_ID(sysval),
1155 &uniforms[i]);
1156 break;
1157 case PAN_SYSVAL_SSBO:
1158 panfrost_upload_ssbo_sysval(batch, st,
1159 PAN_SYSVAL_ID(sysval),
1160 &uniforms[i]);
1161 break;
1162 case PAN_SYSVAL_NUM_WORK_GROUPS:
1163 panfrost_upload_num_work_groups_sysval(batch,
1164 &uniforms[i]);
1165 break;
1166 case PAN_SYSVAL_SAMPLER:
1167 panfrost_upload_sampler_sysval(batch, st,
1168 PAN_SYSVAL_ID(sysval),
1169 &uniforms[i]);
1170 break;
1171 default:
1172 assert(0);
1173 }
1174 }
1175 }
1176
1177 static const void *
1178 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1179 unsigned index)
1180 {
1181 struct pipe_constant_buffer *cb = &buf->cb[index];
1182 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1183
1184 if (rsrc)
1185 return rsrc->bo->cpu;
1186 else if (cb->user_buffer)
1187 return cb->user_buffer;
1188 else
1189 unreachable("No constant buffer");
1190 }
1191
1192 void
1193 panfrost_emit_const_buf(struct panfrost_batch *batch,
1194 enum pipe_shader_type stage,
1195 struct mali_vertex_tiler_postfix *postfix)
1196 {
1197 struct panfrost_context *ctx = batch->ctx;
1198 struct panfrost_shader_variants *all = ctx->shader[stage];
1199
1200 if (!all)
1201 return;
1202
1203 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1204
1205 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1206
1207 /* Uniforms are implicitly UBO #0 */
1208 bool has_uniforms = buf->enabled_mask & (1 << 0);
1209
1210 /* Allocate room for the sysval and the uniforms */
1211 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1212 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1213 size_t size = sys_size + uniform_size;
1214 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1215 size);
1216
1217 /* Upload sysvals requested by the shader */
1218 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1219
1220 /* Upload uniforms */
1221 if (has_uniforms && uniform_size) {
1222 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1223 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1224 }
1225
1226 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1227 * uploaded */
1228
1229 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1230 assert(ubo_count >= 1);
1231
1232 size_t sz = sizeof(uint64_t) * ubo_count;
1233 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1234 int uniform_count = ss->uniform_count;
1235
1236 /* Upload uniforms as a UBO */
1237 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1238
1239 /* The rest are honest-to-goodness UBOs */
1240
1241 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1242 size_t usz = buf->cb[ubo].buffer_size;
1243 bool enabled = buf->enabled_mask & (1 << ubo);
1244 bool empty = usz == 0;
1245
1246 if (!enabled || empty) {
1247 /* Stub out disabled UBOs to catch accesses */
1248 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1249 continue;
1250 }
1251
1252 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1253 buf, ubo);
1254
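/* UBO sizes are encoded as a count of 16-byte fields, rounded up; e.g. a
 * 100-byte UBO is advertised as 7 fields (112 bytes). */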
1255 unsigned bytes_per_field = 16;
1256 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1257 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1258 }
1259
1260 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1261 postfix->uniforms = transfer.gpu;
1262 postfix->uniform_buffers = ubufs;
1263
1264 buf->dirty_mask = 0;
1265 }
1266
1267 void
1268 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1269 const struct pipe_grid_info *info,
1270 struct midgard_payload_vertex_tiler *vtp)
1271 {
1272 struct panfrost_context *ctx = batch->ctx;
1273 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1274 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1275 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1276 128));
1277 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1278 info->grid[2] * 4;
1279 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1280 shared_size,
1281 1);
1282
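/* shared_workgroup_count below is the sum of ceil(log2) of each grid
 * dimension, i.e. log2 of the workgroup count with every dimension padded to
 * a power of two; e.g. a (4, 3, 1) grid gives 2 + 2 + 0 = 4. */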
1283 struct mali_shared_memory shared = {
1284 .shared_memory = bo->gpu,
1285 .shared_workgroup_count =
1286 util_logbase2_ceil(info->grid[0]) +
1287 util_logbase2_ceil(info->grid[1]) +
1288 util_logbase2_ceil(info->grid[2]),
1289 .shared_unk1 = 0x2,
1290 .shared_shift = util_logbase2(single_size) - 1
1291 };
1292
1293 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1294 sizeof(shared));
1295 }
1296
1297 static mali_ptr
1298 panfrost_get_tex_desc(struct panfrost_batch *batch,
1299 enum pipe_shader_type st,
1300 struct panfrost_sampler_view *view)
1301 {
1302 if (!view)
1303 return (mali_ptr) 0;
1304
1305 struct pipe_sampler_view *pview = &view->base;
1306 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1307
1308 /* Add the BO to the job so it's retained until the job is done. */
1309
1310 panfrost_batch_add_bo(batch, rsrc->bo,
1311 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1312 panfrost_bo_access_for_stage(st));
1313
1314 panfrost_batch_add_bo(batch, view->midgard_bo,
1315 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1316 panfrost_bo_access_for_stage(st));
1317
1318 return view->midgard_bo->gpu;
1319 }
1320
1321 void
1322 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1323 enum pipe_shader_type stage,
1324 struct mali_vertex_tiler_postfix *postfix)
1325 {
1326 struct panfrost_context *ctx = batch->ctx;
1327 struct panfrost_device *device = pan_device(ctx->base.screen);
1328
1329 if (!ctx->sampler_view_count[stage])
1330 return;
1331
1332 if (device->quirks & IS_BIFROST) {
1333 struct bifrost_texture_descriptor *descriptors;
1334
1335 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1336 ctx->sampler_view_count[stage]);
1337
1338 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1339 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1340 struct pipe_sampler_view *pview = &view->base;
1341 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1342
1343 /* Add the BOs to the job so they are retained until the job is done. */
1344
1345 panfrost_batch_add_bo(batch, rsrc->bo,
1346 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1347 panfrost_bo_access_for_stage(stage));
1348
1349 panfrost_batch_add_bo(batch, view->bifrost_bo,
1350 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1351 panfrost_bo_access_for_stage(stage));
1352
1353 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1354 }
1355
1356 postfix->textures = panfrost_upload_transient(batch,
1357 descriptors,
1358 sizeof(struct bifrost_texture_descriptor) *
1359 ctx->sampler_view_count[stage]);
1360
1361 free(descriptors);
1362 } else {
1363 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1364
1365 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1366 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1367 ctx->sampler_views[stage][i]);
1368
1369 postfix->textures = panfrost_upload_transient(batch,
1370 trampolines,
1371 sizeof(uint64_t) *
1372 ctx->sampler_view_count[stage]);
1373 }
1374 }
1375
1376 void
1377 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1378 enum pipe_shader_type stage,
1379 struct mali_vertex_tiler_postfix *postfix)
1380 {
1381 struct panfrost_context *ctx = batch->ctx;
1382 struct panfrost_device *device = pan_device(ctx->base.screen);
1383
1384 if (!ctx->sampler_count[stage])
1385 return;
1386
1387 if (device->quirks & IS_BIFROST) {
1388 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1389 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1390 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1391 transfer_size);
1392 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1393
1394 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1395 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1396
1397 postfix->sampler_descriptor = transfer.gpu;
1398 } else {
1399 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1400 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1401 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1402 transfer_size);
1403 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1404
1405 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1406 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1407
1408 postfix->sampler_descriptor = transfer.gpu;
1409 }
1410 }
1411
1412 void
1413 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1414 struct mali_vertex_tiler_postfix *vertex_postfix)
1415 {
1416 struct panfrost_context *ctx = batch->ctx;
1417
1418 if (!ctx->vertex)
1419 return;
1420
1421 struct panfrost_vertex_state *so = ctx->vertex;
1422
1423 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1424 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1425 sizeof(*so->hw) *
1426 PAN_MAX_ATTRIBUTE);
1427 }
1428
1429 void
1430 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1431 struct mali_vertex_tiler_postfix *vertex_postfix)
1432 {
1433 struct panfrost_context *ctx = batch->ctx;
1434 struct panfrost_vertex_state *so = ctx->vertex;
1435
1436 /* Staged mali_attr, and index into them. i =/= k, depending on the
1437 * vertex buffer mask and instancing. Twice as much room is allocated,
1438 * for a worst case of NPOT_DIVIDEs which take up an extra slot */
1439 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1440 unsigned k = 0;
1441
1442 for (unsigned i = 0; i < so->num_elements; ++i) {
1443 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1444 * means duplicating some vertex buffers (who cares? aside from
1445 * maybe some caching implications but I somehow doubt that
1446 * matters) */
1447
1448 struct pipe_vertex_element *elem = &so->pipe[i];
1449 unsigned vbi = elem->vertex_buffer_index;
1450
1451 /* The exception to 1:1 mapping is that we can have multiple
1452 * entries (NPOT divisors), so we fixup anyways */
1453
1454 so->hw[i].index = k;
1455
1456 if (!(ctx->vb_mask & (1 << vbi)))
1457 continue;
1458
1459 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1460 struct panfrost_resource *rsrc;
1461
1462 rsrc = pan_resource(buf->buffer.resource);
1463 if (!rsrc)
1464 continue;
1465
1466 /* Align to 64 bytes by masking off the lower bits. This
1467 * will be adjusted back when we fixup the src_offset in
1468 * mali_attr_meta */
1469
1470 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1471 mali_ptr addr = raw_addr & ~63;
1472 unsigned chopped_addr = raw_addr - addr;
1473
1474 /* Add a dependency of the batch on the vertex buffer */
1475 panfrost_batch_add_bo(batch, rsrc->bo,
1476 PAN_BO_ACCESS_SHARED |
1477 PAN_BO_ACCESS_READ |
1478 PAN_BO_ACCESS_VERTEX_TILER);
1479
1480 /* Set common fields */
1481 attrs[k].elements = addr;
1482 attrs[k].stride = buf->stride;
1483
1484 /* Since we advanced the base pointer, we shrink the buffer
1485 * size */
1486 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1487
1488 /* We need to add the extra size we masked off (for
1489 * correctness) so the data doesn't get clamped away */
1490 attrs[k].size += chopped_addr;
1491
1492 /* For non-instancing make sure we initialize */
1493 attrs[k].shift = attrs[k].extra_flags = 0;
1494
1495 /* Instancing uses a dramatically different code path than
1496 * linear, so dispatch for the actual emission now that the
1497 * common code is finished */
1498
1499 unsigned divisor = elem->instance_divisor;
1500
1501 if (divisor && ctx->instance_count == 1) {
1502 /* Silly corner case where there's a divisor(=1) but
1503 * there's no legitimate instancing. So we want *every*
1504 * attribute to be the same. So set stride to zero so
1505 * we don't go anywhere. */
1506
1507 attrs[k].size = attrs[k].stride + chopped_addr;
1508 attrs[k].stride = 0;
1509 attrs[k++].elements |= MALI_ATTR_LINEAR;
1510 } else if (ctx->instance_count <= 1) {
1511 /* Normal, non-instanced attributes */
1512 attrs[k++].elements |= MALI_ATTR_LINEAR;
1513 } else {
1514 unsigned instance_shift = vertex_postfix->instance_shift;
1515 unsigned instance_odd = vertex_postfix->instance_odd;
1516
1517 k += panfrost_vertex_instanced(ctx->padded_count,
1518 instance_shift,
1519 instance_odd,
1520 divisor, &attrs[k]);
1521 }
1522 }
1523
1524 /* Add special gl_VertexID/gl_InstanceID buffers */
1525
1526 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1527 so->hw[PAN_VERTEX_ID].index = k++;
1528 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1529 so->hw[PAN_INSTANCE_ID].index = k++;
1530
1531 /* Upload whatever we emitted and go */
1532
1533 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1534 k * sizeof(*attrs));
1535 }
1536
1537 static mali_ptr
1538 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1539 unsigned stride, unsigned count)
1540 {
1541 /* Fill out the descriptor */
1542 slot->stride = stride;
1543 slot->size = stride * count;
1544 slot->shift = slot->extra_flags = 0;
1545
1546 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1547 slot->size);
1548
1549 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1550
1551 return transfer.gpu;
1552 }
1553
1554 static void
1555 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1556 unsigned stride, unsigned offset, unsigned count,
1557 struct pipe_stream_output_target *target)
1558 {
1559 /* Fill out the descriptor */
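/* Note: gallium stream-output strides are expressed in dwords, hence the
 * conversion to bytes here. */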
1560 slot->stride = stride * 4;
1561 slot->shift = slot->extra_flags = 0;
1562
1563 unsigned max_size = target->buffer_size;
1564 unsigned expected_size = slot->stride * count;
1565
1566 slot->size = MIN2(max_size, expected_size);
1567
1568 /* Grab the BO and bind it to the batch */
1569 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1570
1571 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1572 * the perspective of the TILER and FRAGMENT.
1573 */
1574 panfrost_batch_add_bo(batch, bo,
1575 PAN_BO_ACCESS_SHARED |
1576 PAN_BO_ACCESS_RW |
1577 PAN_BO_ACCESS_VERTEX_TILER |
1578 PAN_BO_ACCESS_FRAGMENT);
1579
1580 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1581 slot->elements = addr;
1582 }
1583
1584 /* Given a shader and buffer indices, link varying metadata together */
1585
1586 static bool
1587 is_special_varying(gl_varying_slot loc)
1588 {
1589 switch (loc) {
1590 case VARYING_SLOT_POS:
1591 case VARYING_SLOT_PSIZ:
1592 case VARYING_SLOT_PNTC:
1593 case VARYING_SLOT_FACE:
1594 return true;
1595 default:
1596 return false;
1597 }
1598 }
1599
1600 static void
1601 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1602 signed general, signed gl_Position,
1603 signed gl_PointSize, signed gl_PointCoord,
1604 signed gl_FrontFacing)
1605 {
1606 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1607
1608 for (unsigned i = 0; i < ss->varying_count; ++i) {
1609 gl_varying_slot location = ss->varyings_loc[i];
1610 int index = -1;
1611
1612 switch (location) {
1613 case VARYING_SLOT_POS:
1614 index = gl_Position;
1615 break;
1616 case VARYING_SLOT_PSIZ:
1617 index = gl_PointSize;
1618 break;
1619 case VARYING_SLOT_PNTC:
1620 index = gl_PointCoord;
1621 break;
1622 case VARYING_SLOT_FACE:
1623 index = gl_FrontFacing;
1624 break;
1625 default:
1626 index = general;
1627 break;
1628 }
1629
1630 assert(index >= 0);
1631 out[i].index = index;
1632 }
1633 }
1634
1635 static bool
1636 has_point_coord(unsigned mask, gl_varying_slot loc)
1637 {
1638 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1639 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1640 else if (loc == VARYING_SLOT_PNTC)
1641 return (mask & (1 << 8));
1642 else
1643 return false;
1644 }
1645
1646 /* Helpers for manipulating stream out information so we can pack varyings
1647 * accordingly. Compute the src_offset for a given captured varying */
1648
1649 static struct pipe_stream_output *
1650 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1651 {
1652 for (unsigned i = 0; i < info->num_outputs; ++i) {
1653 if (info->output[i].register_index == loc)
1654 return &info->output[i];
1655 }
1656
1657 unreachable("Varying not captured");
1658 }
1659
1660 /* TODO: Integers */
1661 static enum mali_format
1662 pan_xfb_format(unsigned nr_components)
1663 {
1664 switch (nr_components) {
1665 case 1: return MALI_R32F;
1666 case 2: return MALI_RG32F;
1667 case 3: return MALI_RGB32F;
1668 case 4: return MALI_RGBA32F;
1669 default: unreachable("Invalid format");
1670 }
1671 }
1672
1673 void
1674 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1675 unsigned vertex_count,
1676 struct mali_vertex_tiler_postfix *vertex_postfix,
1677 struct mali_vertex_tiler_postfix *tiler_postfix,
1678 union midgard_primitive_size *primitive_size)
1679 {
1680 /* Load the shaders */
1681 struct panfrost_context *ctx = batch->ctx;
1682 struct panfrost_shader_state *vs, *fs;
1683 unsigned int num_gen_varyings = 0;
1684 size_t vs_size, fs_size;
1685
1686 /* Allocate the varying descriptor */
1687
1688 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1689 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1690 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1691 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1692
1693 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1694 vs_size +
1695 fs_size);
1696
1697 struct pipe_stream_output_info *so = &vs->stream_output;
1698
1699 /* Check if this varying is linked by us. This is the case for
1700 * general-purpose, non-captured varyings. If it is, link it. If it's
1701 * not, use the provided stream out information to determine the
1702 * offset, since it was already linked for us. */
1703
1704 for (unsigned i = 0; i < vs->varying_count; i++) {
1705 gl_varying_slot loc = vs->varyings_loc[i];
1706
1707 bool special = is_special_varying(loc);
1708 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1709
1710 if (captured) {
1711 struct pipe_stream_output *o = pan_get_so(so, loc);
1712
1713 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1714 vs->varyings[i].src_offset = dst_offset;
1715 } else if (!special) {
1716 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1717 }
1718 }
1719
1720 /* Conversely, we need to set src_offset for the captured varyings.
1721 * Here, the layout is defined by the stream out info, not us */
1722
1723 /* Link up with fragment varyings */
1724 bool reads_point_coord = fs->reads_point_coord;
1725
1726 for (unsigned i = 0; i < fs->varying_count; i++) {
1727 gl_varying_slot loc = fs->varyings_loc[i];
1728 unsigned src_offset;
1729 signed vs_idx = -1;
1730
1731 /* Link up */
1732 for (unsigned j = 0; j < vs->varying_count; ++j) {
1733 if (vs->varyings_loc[j] == loc) {
1734 vs_idx = j;
1735 break;
1736 }
1737 }
1738
1739 /* Either assign or reuse */
1740 if (vs_idx >= 0)
1741 src_offset = vs->varyings[vs_idx].src_offset;
1742 else
1743 src_offset = 16 * (num_gen_varyings++);
1744
1745 fs->varyings[i].src_offset = src_offset;
1746
1747 if (has_point_coord(fs->point_sprite_mask, loc))
1748 reads_point_coord = true;
1749 }
1750
1751 memcpy(trans.cpu, vs->varyings, vs_size);
1752 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1753
1754 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1755
1756 /* Figure out how many streamout buffers could be bound */
1757 unsigned so_count = ctx->streamout.num_targets;
1758 for (unsigned i = 0; i < vs->varying_count; i++) {
1759 gl_varying_slot loc = vs->varyings_loc[i];
1760
1761 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1762 if (!captured) continue;
1763
1764 struct pipe_stream_output *o = pan_get_so(so, loc);
1765 so_count = MAX2(so_count, o->output_buffer + 1);
1766 }
1767
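        /* Lay out the attribute buffer records: streamout buffers occupy
         * indices [0, so_count), followed by the general varying buffer and
         * then one record per special varying that is actually used. idx ends
         * up as the total record count uploaded at the end. */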
1768 signed idx = so_count;
1769 signed general = idx++;
1770 signed gl_Position = idx++;
1771 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1772 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1773 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1774 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1775
1776 /* Emit the stream out buffers */
1777
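        /* out_count is the number of vertices that will actually land in the
         * streamout buffers for this draw, which can exceed ctx->vertex_count
         * once strips/fans are decomposed into individual primitives. */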
1778 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1779 ctx->vertex_count);
1780
1781 for (unsigned i = 0; i < so_count; ++i) {
1782 if (i < ctx->streamout.num_targets) {
1783 panfrost_emit_streamout(batch, &varyings[i],
1784 so->stride[i],
1785 ctx->streamout.offsets[i],
1786 out_count,
1787 ctx->streamout.targets[i]);
1788 } else {
1789 /* Emit a dummy buffer */
1790 panfrost_emit_varyings(batch, &varyings[i],
1791 so->stride[i] * 4,
1792 out_count);
1793
1794 /* Clear the attribute type */
1795 varyings[i].elements &= ~0xF;
1796 }
1797 }
1798
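        /* The general buffer packs every linked varying contiguously, 16 bytes
         * per varying, for each vertex processed. */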
1799 panfrost_emit_varyings(batch, &varyings[general],
1800 num_gen_varyings * 16,
1801 vertex_count);
1802
1803 mali_ptr varyings_p;
1804
1805 /* fp32 vec4 gl_Position */
1806 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1807 sizeof(float) * 4, vertex_count);
1808 tiler_postfix->position_varying = varyings_p;
1809
1810
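        /* Point size is emitted at 2 bytes per vertex, i.e. a single 16-bit
         * value per vertex. */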
1811 if (panfrost_writes_point_size(ctx)) {
1812 varyings_p = panfrost_emit_varyings(batch,
1813 &varyings[gl_PointSize],
1814 2, vertex_count);
1815 primitive_size->pointer = varyings_p;
1816 }
1817
1818 if (reads_point_coord)
1819 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1820
1821 if (fs->reads_face)
1822 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1823
1824 if (fs->reads_frag_coord)
1825 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1826
1827 struct panfrost_device *device = pan_device(ctx->base.screen);
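        /* Point coordinate reads are not expected on Bifrost in this path, so
         * assert that a Bifrost device never asks for them here. */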
1828 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1829
1830         /* Now that this information is available, link the varying metadata
1831          * to the buffer in question. Note that VARYING_SLOT_POS is mapped to
1832          * gl_FragCoord for fragment shaders but to gl_Position for vertex
1833          * shaders. */
1834
1835 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1836 gl_PointSize, gl_PointCoord,
1837 gl_FrontFacing);
1838
1839 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1840 gl_FragCoord, gl_PointSize,
1841 gl_PointCoord, gl_FrontFacing);
1842
1843 /* Replace streamout */
1844         /* Patch captured (streamout) varyings to reference their streamout buffers */
1845 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1846 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1847
1848 for (unsigned i = 0; i < vs->varying_count; i++) {
1849 gl_varying_slot loc = vs->varyings_loc[i];
1850
1851 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1852 if (!captured)
1853 continue;
1854
1855 struct pipe_stream_output *o = pan_get_so(so, loc);
1856 ovs[i].index = o->output_buffer;
1857
1858 /* Set the type appropriately. TODO: Integer varyings XXX */
1859 assert(o->stream == 0);
1860 ovs[i].format = pan_xfb_format(o->num_components);
1861
1862 if (device->quirks & HAS_SWIZZLES)
1863 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1864 else
1865 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1866
1867 /* Link to the fragment */
1868 signed fs_idx = -1;
1869
1870 /* Link up */
1871 for (unsigned j = 0; j < fs->varying_count; ++j) {
1872 if (fs->varyings_loc[j] == loc) {
1873 fs_idx = j;
1874 break;
1875 }
1876 }
1877
1878 if (fs_idx >= 0) {
1879 ofs[fs_idx].index = ovs[i].index;
1880 ofs[fs_idx].format = ovs[i].format;
1881 ofs[fs_idx].swizzle = ovs[i].swizzle;
1882 }
1883 }
1884
1885 /* Replace point sprite */
1886 for (unsigned i = 0; i < fs->varying_count; i++) {
1887                 /* If we have a point sprite replacement, handle that here; the
1888                  * location has to be translated first. TODO: flip Y in the shader.
1889                  * We already key the shader for it; it is just not done yet. */
1890
1891 if (has_point_coord(fs->point_sprite_mask,
1892 fs->varyings_loc[i])) {
1893 ofs[i].index = gl_PointCoord;
1894
1895 /* Swizzle out the z/w to 0/1 */
1896 ofs[i].format = MALI_RG16F;
1897 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1898 }
1899 }
1900
1901 /* Fix up unaligned addresses */
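        /* Streamout buffer addresses may include user-supplied byte offsets,
         * so they are not necessarily 64-byte aligned. Strip the low bits from
         * the record, mark it linear, grow the recorded size to compensate and
         * fold the slack into each referencing varying's source offset. */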
1902 for (unsigned i = 0; i < so_count; ++i) {
1903 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1904 continue;
1905
1906 unsigned align = (varyings[i].elements & 63);
1907
1908 /* While we're at it, the SO buffers are linear */
1909
1910 if (!align) {
1911 varyings[i].elements |= MALI_ATTR_LINEAR;
1912 continue;
1913 }
1914
1915 /* We need to adjust alignment */
1916 varyings[i].elements &= ~63;
1917 varyings[i].elements |= MALI_ATTR_LINEAR;
1918 varyings[i].size += align;
1919
1920 for (unsigned v = 0; v < vs->varying_count; ++v) {
1921 if (ovs[v].index != i)
1922 continue;
1923
1924 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1925 }
1926
1927 for (unsigned f = 0; f < fs->varying_count; ++f) {
1928 if (ofs[f].index != i)
1929 continue;
1930
1931 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1932 }
1933 }
1934
1935 varyings_p = panfrost_upload_transient(batch, varyings,
1936 idx * sizeof(*varyings));
1937 vertex_postfix->varyings = varyings_p;
1938 tiler_postfix->varyings = varyings_p;
1939
1940 vertex_postfix->varying_meta = trans.gpu;
1941 tiler_postfix->varying_meta = trans.gpu + vs_size;
1942 }
1943
1944 void
1945 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1946 struct mali_vertex_tiler_prefix *vertex_prefix,
1947 struct mali_vertex_tiler_postfix *vertex_postfix,
1948 struct mali_vertex_tiler_prefix *tiler_prefix,
1949 struct mali_vertex_tiler_postfix *tiler_postfix,
1950 union midgard_primitive_size *primitive_size)
1951 {
1952 struct panfrost_context *ctx = batch->ctx;
1953 struct panfrost_device *device = pan_device(ctx->base.screen);
1954 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1955 struct bifrost_payload_vertex bifrost_vertex = {0,};
1956 struct bifrost_payload_tiler bifrost_tiler = {0,};
1957 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1958 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1959 void *vp, *tp;
1960 size_t vp_size, tp_size;
1961
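        /* Bifrost and Midgard use different payload layouts; fill in whichever
         * applies and submit it through the common job interface below. */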
1962 if (device->quirks & IS_BIFROST) {
1963 bifrost_vertex.prefix = *vertex_prefix;
1964 bifrost_vertex.postfix = *vertex_postfix;
1965 vp = &bifrost_vertex;
1966 vp_size = sizeof(bifrost_vertex);
1967
1968 bifrost_tiler.prefix = *tiler_prefix;
1969 bifrost_tiler.tiler.primitive_size = *primitive_size;
1970 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1971 bifrost_tiler.postfix = *tiler_postfix;
1972 tp = &bifrost_tiler;
1973 tp_size = sizeof(bifrost_tiler);
1974 } else {
1975 midgard_vertex.prefix = *vertex_prefix;
1976 midgard_vertex.postfix = *vertex_postfix;
1977 vp = &midgard_vertex;
1978 vp_size = sizeof(midgard_vertex);
1979
1980 midgard_tiler.prefix = *tiler_prefix;
1981 midgard_tiler.postfix = *tiler_postfix;
1982 midgard_tiler.primitive_size = *primitive_size;
1983 tp = &midgard_tiler;
1984 tp_size = sizeof(midgard_tiler);
1985 }
1986
1987 if (wallpapering) {
1988 /* Inject in reverse order, with "predicted" job indices.
1989 * THIS IS A HACK XXX */
1990 panfrost_new_job(batch, JOB_TYPE_TILER, false,
1991 batch->job_index + 2, tp, tp_size, true);
1992 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1993 vp, vp_size, true);
1994 return;
1995 }
1996
1997         /* If rasterizer discard is enabled, only submit the vertex job */
1998
1999 bool rasterizer_discard = ctx->rasterizer &&
2000 ctx->rasterizer->base.rasterizer_discard;
2001
2002 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2003 vp, vp_size, false);
2004
2005 if (rasterizer_discard)
2006 return;
2007
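        /* Otherwise chain the tiler job after the vertex job, passing the
         * vertex job's index as a dependency so the vertex outputs are ready
         * before tiling consumes them. */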
2008 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2009 false);
2010 }
2011
2012 /* TODO: stop hardcoding this */
2013 mali_ptr
2014 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2015 {
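        /* 96 half-words, i.e. 48 (x, y) sample position pairs, currently
         * hardcoded (see the TODO above); the exact layout expected by the
         * hardware is not spelled out here. */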
2016 uint16_t locations[] = {
2017 128, 128,
2018 0, 256,
2019 0, 256,
2020 0, 256,
2021 0, 256,
2022 0, 256,
2023 0, 256,
2024 0, 256,
2025 0, 256,
2026 0, 256,
2027 0, 256,
2028 0, 256,
2029 0, 256,
2030 0, 256,
2031 0, 256,
2032 0, 256,
2033 0, 256,
2034 0, 256,
2035 0, 256,
2036 0, 256,
2037 0, 256,
2038 0, 256,
2039 0, 256,
2040 0, 256,
2041 0, 256,
2042 0, 256,
2043 0, 256,
2044 0, 256,
2045 0, 256,
2046 0, 256,
2047 0, 256,
2048 0, 256,
2049 128, 128,
2050 0, 0,
2051 0, 0,
2052 0, 0,
2053 0, 0,
2054 0, 0,
2055 0, 0,
2056 0, 0,
2057 0, 0,
2058 0, 0,
2059 0, 0,
2060 0, 0,
2061 0, 0,
2062 0, 0,
2063 0, 0,
2064 0, 0,
2065 };
2066
2067 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2068 }