panfrost: Set unk2 to accommodate blending
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
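/* Emits the shared-memory/TLS descriptor used for Bifrost vertex/tiler work:
 * the per-thread stack size is encoded as a shift, the scratchpad BO is sized
 * from the batch's worst-case stack usage across all cores, and
 * shared_workgroup_count is left at ~0 (presumably meaning "no workgroup
 * shared memory", since compute dispatch is handled separately below). */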
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
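/* On Midgard, the postfix "shared_memory" slot instead points at the
 * framebuffer descriptor. Reserve transient space for it on first use (SFBD
 * or MFBD depending on the GPU) and, for MFBD, tag the pointer with MALI_MFBD
 * so later consumers can tell the two descriptor formats apart. */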
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
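/* Common initialization for a vertex/tiler prefix/postfix pair. Bifrost
 * points shared_memory at a real TLS descriptor, while Midgard reuses that
 * slot for the framebuffer descriptor; the tiler half (initialized with the
 * fragment stage) additionally picks up occlusion-query and rasterizer state.
 * The gl_enables magic values differ per architecture and are not fully
 * understood. */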
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
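/* The padded count is apparently consumed decomposed as (2k + 1) << shift,
 * i.e. an odd factor times a power of two; e.g. a padded count of
 * 24 = (2*1 + 1) << 3 yields shift = 3 and odd factor k = 1. */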
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349 meta->midgard1.flags_hi = 0x8; /* XXX */
350 meta->midgard1.flags_lo = 0x220;
351 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
352 }
353 }
354
355 static unsigned
356 panfrost_translate_compare_func(enum pipe_compare_func in)
357 {
358 switch (in) {
359 case PIPE_FUNC_NEVER:
360 return MALI_FUNC_NEVER;
361
362 case PIPE_FUNC_LESS:
363 return MALI_FUNC_LESS;
364
365 case PIPE_FUNC_EQUAL:
366 return MALI_FUNC_EQUAL;
367
368 case PIPE_FUNC_LEQUAL:
369 return MALI_FUNC_LEQUAL;
370
371 case PIPE_FUNC_GREATER:
372 return MALI_FUNC_GREATER;
373
374 case PIPE_FUNC_NOTEQUAL:
375 return MALI_FUNC_NOTEQUAL;
376
377 case PIPE_FUNC_GEQUAL:
378 return MALI_FUNC_GEQUAL;
379
380 case PIPE_FUNC_ALWAYS:
381 return MALI_FUNC_ALWAYS;
382
383 default:
384 unreachable("Invalid func");
385 }
386 }
387
388 static unsigned
389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
390 {
391 switch (in) {
392 case PIPE_STENCIL_OP_KEEP:
393 return MALI_STENCIL_KEEP;
394
395 case PIPE_STENCIL_OP_ZERO:
396 return MALI_STENCIL_ZERO;
397
398 case PIPE_STENCIL_OP_REPLACE:
399 return MALI_STENCIL_REPLACE;
400
401 case PIPE_STENCIL_OP_INCR:
402 return MALI_STENCIL_INCR;
403
404 case PIPE_STENCIL_OP_DECR:
405 return MALI_STENCIL_DECR;
406
407 case PIPE_STENCIL_OP_INCR_WRAP:
408 return MALI_STENCIL_INCR_WRAP;
409
410 case PIPE_STENCIL_OP_DECR_WRAP:
411 return MALI_STENCIL_DECR_WRAP;
412
413 case PIPE_STENCIL_OP_INVERT:
414 return MALI_STENCIL_INVERT;
415
416 default:
417 unreachable("Invalid stencil op");
418 }
419 }
420
421 static unsigned
422 translate_tex_wrap(enum pipe_tex_wrap w)
423 {
424 switch (w) {
425 case PIPE_TEX_WRAP_REPEAT:
426 return MALI_WRAP_REPEAT;
427
428 case PIPE_TEX_WRAP_CLAMP:
429 return MALI_WRAP_CLAMP;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
432 return MALI_WRAP_CLAMP_TO_EDGE;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435 return MALI_WRAP_CLAMP_TO_BORDER;
436
437 case PIPE_TEX_WRAP_MIRROR_REPEAT:
438 return MALI_WRAP_MIRRORED_REPEAT;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP:
441 return MALI_WRAP_MIRRORED_CLAMP;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
444 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
448
449 default:
450 unreachable("Invalid wrap");
451 }
452 }
453
454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
455 struct mali_sampler_descriptor *hw)
456 {
457 unsigned func = panfrost_translate_compare_func(cso->compare_func);
458 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
459 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
460 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
461 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
462 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
463 unsigned mip_filter = mip_linear ?
464 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
465 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
466
467 *hw = (struct mali_sampler_descriptor) {
468 .filter_mode = min_filter | mag_filter | mip_filter |
469 normalized,
470 .wrap_s = translate_tex_wrap(cso->wrap_s),
471 .wrap_t = translate_tex_wrap(cso->wrap_t),
472 .wrap_r = translate_tex_wrap(cso->wrap_r),
473 .compare_func = panfrost_flip_compare_func(func),
474 .border_color = {
475 cso->border_color.f[0],
476 cso->border_color.f[1],
477 cso->border_color.f[2],
478 cso->border_color.f[3]
479 },
480 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
481 .max_lod = FIXED_16(cso->max_lod, false),
482 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
483 .seamless_cube_map = cso->seamless_cube_map,
484 };
485
486 /* If necessary, we disable mipmapping in the sampler descriptor by
487 * clamping the LOD as tight as possible (from 0 to epsilon,
488 * essentially -- remember these are fixed point numbers, so
489 * epsilon=1/256) */
490
491 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
492 hw->max_lod = hw->min_lod + 1;
493 }
494
495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
496 struct bifrost_sampler_descriptor *hw)
497 {
498 *hw = (struct bifrost_sampler_descriptor) {
499 .unk1 = 0x1,
500 .wrap_s = translate_tex_wrap(cso->wrap_s),
501 .wrap_t = translate_tex_wrap(cso->wrap_t),
502 .wrap_r = translate_tex_wrap(cso->wrap_r),
503 .unk8 = 0x8,
504 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
505 .norm_coords = cso->normalized_coords,
506 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
507 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
508 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
509 .max_lod = FIXED_16(cso->max_lod, false),
510 };
511
512 /* If necessary, we disable mipmapping in the sampler descriptor by
513 * clamping the LOD as tight as possible (from 0 to epsilon,
514 * essentially -- remember these are fixed point numbers, so
515 * epsilon=1/256) */
516
517 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
518 hw->max_lod = hw->min_lod + 1;
519 }
520
521 static void
522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
523 struct mali_stencil_test *out)
524 {
525 out->ref = 0; /* Gallium gets it from elsewhere */
526
527 out->mask = in->valuemask;
528 out->func = panfrost_translate_compare_func(in->func);
529 out->sfail = panfrost_translate_stencil_op(in->fail_op);
530 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
531 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
532 }
533
534 static void
535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
536 struct mali_shader_meta *fragmeta)
537 {
538 if (!ctx->rasterizer) {
539 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
540 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
541 fragmeta->depth_units = 0.0f;
542 fragmeta->depth_factor = 0.0f;
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
544 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
545 return;
546 }
547
548 bool msaa = ctx->rasterizer->base.multisample;
549
550 /* TODO: Sample size */
551 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
553 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
554 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
555
556 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
557
558 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
559 ctx->rasterizer->base.offset_tri);
560 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
561 ctx->rasterizer->base.offset_tri);
562 }
563
564 static void
565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
566 struct mali_shader_meta *fragmeta)
567 {
568 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
569 int zfunc = PIPE_FUNC_ALWAYS;
570
571 if (!zsa) {
572 struct pipe_stencil_state default_stencil = {
573 .enabled = 0,
574 .func = PIPE_FUNC_ALWAYS,
575 .fail_op = MALI_STENCIL_KEEP,
576 .zfail_op = MALI_STENCIL_KEEP,
577 .zpass_op = MALI_STENCIL_KEEP,
578 .writemask = 0xFF,
579 .valuemask = 0xFF
580 };
581
582 panfrost_make_stencil_state(&default_stencil,
583 &fragmeta->stencil_front);
584 fragmeta->stencil_mask_front = default_stencil.writemask;
585 fragmeta->stencil_back = fragmeta->stencil_front;
586 fragmeta->stencil_mask_back = default_stencil.writemask;
587 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
588 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
589 } else {
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
591 zsa->stencil[0].enabled);
592 panfrost_make_stencil_state(&zsa->stencil[0],
593 &fragmeta->stencil_front);
594 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
595 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
596
597 /* If back-stencil is not enabled, use the front values */
598
599 if (zsa->stencil[1].enabled) {
600 panfrost_make_stencil_state(&zsa->stencil[1],
601 &fragmeta->stencil_back);
602 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
603 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
604 } else {
605 fragmeta->stencil_back = fragmeta->stencil_front;
606 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
607 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
608 }
609
610 if (zsa->depth.enabled)
611 zfunc = zsa->depth.func;
612
613 /* Depth state (TODO: Refactor) */
614
615 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
616 zsa->depth.writemask);
617 }
618
619 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
620 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
621 }
622
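/* Decides whether the fragment shader actually needs to execute: it can be
 * skipped when it has no side effects, writes no enabled colour target, and
 * does not write depth/stencil. Used below to enable the Midgard
 * "shaderless" fast path. */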
623 static bool
624 panfrost_fs_required(
625 struct panfrost_shader_state *fs,
626 struct panfrost_blend_final *blend,
627 unsigned rt_count)
628 {
629 /* If we generally have side effects */
630 if (fs->fs_sidefx)
631 return true;
632
633 /* If colour is written we need to execute */
634 for (unsigned i = 0; i < rt_count; ++i) {
635 if (!blend[i].no_colour)
636 return true;
637 }
638
639 /* If depth is written and not implied we need to execute.
640 * TODO: Predicate on Z/S writes being enabled */
641 return (fs->writes_depth || fs->writes_stencil);
642 }
643
644 static void
645 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
646 struct mali_shader_meta *fragmeta,
647 void *rts)
648 {
649 const struct panfrost_device *dev = pan_device(ctx->base.screen);
650 struct panfrost_shader_state *fs;
651 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
652
653 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
654 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
655 !ctx->blend->base.dither);
656
657 /* Get blending setup */
658 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
659
660 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
661 unsigned shader_offset = 0;
662 struct panfrost_bo *shader_bo = NULL;
663
664 for (unsigned c = 0; c < rt_count; ++c)
665 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
666 &shader_offset);
667
668 /* Disable shader execution if we can */
669 if (dev->quirks & MIDGARD_SHADERLESS
670 && !panfrost_fs_required(fs, blend, rt_count)) {
671 fragmeta->shader = 0;
672 fragmeta->attribute_count = 0;
673 fragmeta->varying_count = 0;
674 fragmeta->texture_count = 0;
675 fragmeta->sampler_count = 0;
676
677 /* This feature is not known to work on Bifrost */
678 fragmeta->midgard1.work_count = 1;
679 fragmeta->midgard1.uniform_count = 0;
680 fragmeta->midgard1.uniform_buffer_count = 0;
681 }
682
683 /* If there is a blend shader, work registers are shared. We impose 8
684 * work registers as a limit for blend shaders. Should be lower XXX */
685
686 if (!(dev->quirks & IS_BIFROST)) {
687 for (unsigned c = 0; c < rt_count; ++c) {
688 if (blend[c].is_shader) {
689 fragmeta->midgard1.work_count =
690 MAX2(fragmeta->midgard1.work_count, 8);
691 }
692 }
693 }
694
695 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
696 * copied to the blend_meta appended (by convention), but this is the
697 * field actually read by the hardware. (Or maybe both are read...?).
698 * Specify the last RTi with a blend shader. */
699
700 fragmeta->blend.shader = 0;
701
702 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
703 if (!blend[rt].is_shader)
704 continue;
705
706 fragmeta->blend.shader = blend[rt].shader.gpu |
707 blend[rt].shader.first_tag;
708 break;
709 }
710
711 if (dev->quirks & MIDGARD_SFBD) {
712 /* On platforms with only a single render target (SFBD), the blend
713 * information is inside the shader meta itself. We additionally
714 * need to signal CAN_DISCARD for nontrivial blend modes (so
715 * we're able to read back the destination buffer) */
716
717 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
718 blend[0].is_shader);
719
720 if (!blend[0].is_shader) {
721 fragmeta->blend.equation = *blend[0].equation.equation;
722 fragmeta->blend.constant = blend[0].equation.constant;
723 }
724
725 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
726 !blend[0].no_blending || fs->can_discard);
727 return;
728 }
729
730 /* Additional blend descriptor tacked on for jobs using MFBD */
731
732 for (unsigned i = 0; i < rt_count; ++i) {
733 unsigned flags = 0;
734
735 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
736 flags = 0x200;
737
738 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
739 (ctx->pipe_framebuffer.cbufs[i]) &&
740 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
741
742 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
743 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
744 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
745 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
746 }
747
748 if (dev->quirks & IS_BIFROST) {
749 struct bifrost_blend_rt *brts = rts;
750
751 brts[i].flags = flags;
752
753 if (blend[i].is_shader) {
754 /* The blend shader's address needs to be at
755 * the same top 32 bit as the fragment shader.
756 * TODO: Ensure that's always the case.
757 */
758 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
759 (fs->bo->gpu & (0xffffffffull << 32)));
760 brts[i].shader = blend[i].shader.gpu;
761 brts[i].unk2 = 0x0;
762 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
763 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
764 const struct util_format_description *format_desc;
765 format_desc = util_format_description(format);
766
767 brts[i].equation = *blend[i].equation.equation;
768
769 /* TODO: this is a bit more complicated */
770 brts[i].constant = blend[i].equation.constant;
771
772 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
773
774 /* 0x19 disables blending and forces REPLACE
775 * mode (equivalent to rgb_mode = alpha_mode =
776 * 0x122, colour mask = 0xF). 0x1a allows
777 * blending. */
778 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
779
780 brts[i].shader_type = fs->blend_types[i];
781 } else {
782 /* Dummy attachment for depth-only */
783 brts[i].unk2 = 0x3;
784 brts[i].shader_type = fs->blend_types[i];
785 }
786 } else {
787 struct midgard_blend_rt *mrts = rts;
788 mrts[i].flags = flags;
789
790 if (blend[i].is_shader) {
791 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
792 } else {
793 mrts[i].blend.equation = *blend[i].equation.equation;
794 mrts[i].blend.constant = blend[i].equation.constant;
795 }
796 }
797 }
798 }
799
800 static void
801 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
802 struct mali_shader_meta *fragmeta,
803 void *rts)
804 {
805 const struct panfrost_device *dev = pan_device(ctx->base.screen);
806 struct panfrost_shader_state *fs;
807
808 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
809
810 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
811 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
812 fragmeta->unknown2_4 = 0x4e0;
813
814 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
815 * is required (independent of 32-bit/64-bit descriptors), or why it's
816 * not used on later GPU revisions. Otherwise, all shader jobs fault on
817 * these earlier chips (perhaps this is a chicken bit of some kind).
818 * More investigation is needed. */
819
820 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
821
822 if (dev->quirks & IS_BIFROST) {
823 /* TODO */
824 } else {
825 /* Depending on whether it's legal to do so in the given shader, we try to
826 * enable early-z testing (or forward-pixel kill?) */
827
828 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
829 !fs->can_discard && !fs->writes_depth);
830
831 /* Add the writes Z/S flags if needed. */
832 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
833 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
834
835 /* Any time texturing is used, derivatives are implicitly calculated,
836 * so we need to enable helper invocations */
837
838 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
839 fs->helper_invocations);
840
841 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
842
843 bool depth_enabled = fs->writes_depth ||
844 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
845
846 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
847 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
848 }
849
850 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
851 panfrost_frag_meta_zsa_update(ctx, fragmeta);
852 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
853 }
854
855 void
856 panfrost_emit_shader_meta(struct panfrost_batch *batch,
857 enum pipe_shader_type st,
858 struct mali_vertex_tiler_postfix *postfix)
859 {
860 struct panfrost_context *ctx = batch->ctx;
861 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
862
863 if (!ss) {
864 postfix->shader = 0;
865 return;
866 }
867
868 struct mali_shader_meta meta;
869
870 panfrost_shader_meta_init(ctx, st, &meta);
871
872 /* Add the shader BO to the batch. */
873 panfrost_batch_add_bo(batch, ss->bo,
874 PAN_BO_ACCESS_PRIVATE |
875 PAN_BO_ACCESS_READ |
876 panfrost_bo_access_for_stage(st));
877
878 mali_ptr shader_ptr;
879
880 if (st == PIPE_SHADER_FRAGMENT) {
881 struct panfrost_device *dev = pan_device(ctx->base.screen);
882 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
883 size_t desc_size = sizeof(meta);
884 void *rts = NULL;
885 struct panfrost_transfer xfer;
886 unsigned rt_size;
887
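/* On MFBD GPUs the shader descriptor is followed in memory by one blend
 * descriptor per render target (Midgard and Bifrost use different layouts
 * for these); SFBD keeps blend state inside the shader meta itself, so no
 * extra space is needed there. */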
888 if (dev->quirks & MIDGARD_SFBD)
889 rt_size = 0;
890 else if (dev->quirks & IS_BIFROST)
891 rt_size = sizeof(struct bifrost_blend_rt);
892 else
893 rt_size = sizeof(struct midgard_blend_rt);
894
895 desc_size += rt_size * rt_count;
896
897 if (rt_size)
898 rts = rzalloc_size(ctx, rt_size * rt_count);
899
900 panfrost_frag_shader_meta_init(ctx, &meta, rts);
901
902 xfer = panfrost_allocate_transient(batch, desc_size);
903
904 memcpy(xfer.cpu, &meta, sizeof(meta));
905 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
906
907 if (rt_size)
908 ralloc_free(rts);
909
910 shader_ptr = xfer.gpu;
911 } else {
912 shader_ptr = panfrost_upload_transient(batch, &meta,
913 sizeof(meta));
914 }
915
916 postfix->shader = shader_ptr;
917 }
918
919 static void
920 panfrost_mali_viewport_init(struct panfrost_context *ctx,
921 struct mali_viewport *mvp)
922 {
923 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
924
925 /* Clip bounds are encoded as floats. The viewport itself is encoded as
926 * (somewhat) asymmetric ints. */
927
928 const struct pipe_scissor_state *ss = &ctx->scissor;
929
930 memset(mvp, 0, sizeof(*mvp));
931
932 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
933 * each direction. Clipping to the viewport in theory should work, but
934 * in practice causes issues when we're not explicitly trying to
935 * scissor */
936
937 *mvp = (struct mali_viewport) {
938 .clip_minx = -INFINITY,
939 .clip_miny = -INFINITY,
940 .clip_maxx = INFINITY,
941 .clip_maxy = INFINITY,
942 };
943
944 /* Always scissor to the viewport by default. */
945 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
946 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
947
948 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
949 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
950
951 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
952 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
953
954 /* Apply the scissor test */
955
956 unsigned minx, miny, maxx, maxy;
957
958 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
959 minx = MAX2(ss->minx, vp_minx);
960 miny = MAX2(ss->miny, vp_miny);
961 maxx = MIN2(ss->maxx, vp_maxx);
962 maxy = MIN2(ss->maxy, vp_maxy);
963 } else {
964 minx = vp_minx;
965 miny = vp_miny;
966 maxx = vp_maxx;
967 maxy = vp_maxy;
968 }
969
970 /* Hardware needs the min/max to be strictly ordered, so flip if we
971 * need to. The viewport transformation in the vertex shader will
972 * handle the negatives if we don't */
973
974 if (miny > maxy) {
975 unsigned temp = miny;
976 miny = maxy;
977 maxy = temp;
978 }
979
980 if (minx > maxx) {
981 unsigned temp = minx;
982 minx = maxx;
983 maxx = temp;
984 }
985
986 if (minz > maxz) {
987 float temp = minz;
988 minz = maxz;
989 maxz = temp;
990 }
991
992 /* Clamp to the framebuffer size as a last check */
993
994 minx = MIN2(ctx->pipe_framebuffer.width, minx);
995 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
996
997 miny = MIN2(ctx->pipe_framebuffer.height, miny);
998 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
999
1000 /* Upload */
1001
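/* viewport0 holds the inclusive minimum and viewport1 the inclusive maximum;
 * MALI_POSITIVE stores the maximum biased down by one, which is why
 * panfrost_emit_viewport adds the 1 back when unioning the batch scissor. */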
1002 mvp->viewport0[0] = minx;
1003 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1004
1005 mvp->viewport0[1] = miny;
1006 mvp->viewport1[1] = MALI_POSITIVE(maxy);
1007
1008 mvp->clip_minz = minz;
1009 mvp->clip_maxz = maxz;
1010 }
1011
1012 void
1013 panfrost_emit_viewport(struct panfrost_batch *batch,
1014 struct mali_vertex_tiler_postfix *tiler_postfix)
1015 {
1016 struct panfrost_context *ctx = batch->ctx;
1017 struct mali_viewport mvp;
1018
1019 panfrost_mali_viewport_init(batch->ctx, &mvp);
1020
1021 /* Update the job, unless we're doing wallpapering (whose lack of
1022 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1023 * just... be faster :) */
1024
1025 if (!ctx->wallpaper_batch)
1026 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1027 mvp.viewport0[1],
1028 mvp.viewport1[0] + 1,
1029 mvp.viewport1[1] + 1);
1030
1031 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1032 sizeof(mvp));
1033 }
1034
1035 static mali_ptr
1036 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1037 enum pipe_shader_type st,
1038 struct panfrost_constant_buffer *buf,
1039 unsigned index)
1040 {
1041 struct pipe_constant_buffer *cb = &buf->cb[index];
1042 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1043
1044 if (rsrc) {
1045 panfrost_batch_add_bo(batch, rsrc->bo,
1046 PAN_BO_ACCESS_SHARED |
1047 PAN_BO_ACCESS_READ |
1048 panfrost_bo_access_for_stage(st));
1049
1050 /* Alignment guaranteed by
1051 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1052 return rsrc->bo->gpu + cb->buffer_offset;
1053 } else if (cb->user_buffer) {
1054 return panfrost_upload_transient(batch,
1055 cb->user_buffer +
1056 cb->buffer_offset,
1057 cb->buffer_size);
1058 } else {
1059 unreachable("No constant buffer");
1060 }
1061 }
1062
1063 struct sysval_uniform {
1064 union {
1065 float f[4];
1066 int32_t i[4];
1067 uint32_t u[4];
1068 uint64_t du[2];
1069 };
1070 };
1071
1072 static void
1073 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1074 struct sysval_uniform *uniform)
1075 {
1076 struct panfrost_context *ctx = batch->ctx;
1077 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1078
1079 uniform->f[0] = vp->scale[0];
1080 uniform->f[1] = vp->scale[1];
1081 uniform->f[2] = vp->scale[2];
1082 }
1083
1084 static void
1085 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1086 struct sysval_uniform *uniform)
1087 {
1088 struct panfrost_context *ctx = batch->ctx;
1089 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1090
1091 uniform->f[0] = vp->translate[0];
1092 uniform->f[1] = vp->translate[1];
1093 uniform->f[2] = vp->translate[2];
1094 }
1095
1096 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1097 enum pipe_shader_type st,
1098 unsigned int sysvalid,
1099 struct sysval_uniform *uniform)
1100 {
1101 struct panfrost_context *ctx = batch->ctx;
1102 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1103 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1104 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1105 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1106
1107 assert(dim);
1108 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1109
1110 if (dim > 1)
1111 uniform->i[1] = u_minify(tex->texture->height0,
1112 tex->u.tex.first_level);
1113
1114 if (dim > 2)
1115 uniform->i[2] = u_minify(tex->texture->depth0,
1116 tex->u.tex.first_level);
1117
1118 if (is_array)
1119 uniform->i[dim] = tex->texture->array_size;
1120 }
1121
1122 static void
1123 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1124 enum pipe_shader_type st,
1125 unsigned ssbo_id,
1126 struct sysval_uniform *uniform)
1127 {
1128 struct panfrost_context *ctx = batch->ctx;
1129
1130 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1131 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1132
1133 /* Compute address */
1134 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1135
1136 panfrost_batch_add_bo(batch, bo,
1137 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1138 panfrost_bo_access_for_stage(st));
1139
1140 /* Upload address and size as sysval */
1141 uniform->du[0] = bo->gpu + sb.buffer_offset;
1142 uniform->u[2] = sb.buffer_size;
1143 }
1144
1145 static void
1146 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1147 enum pipe_shader_type st,
1148 unsigned samp_idx,
1149 struct sysval_uniform *uniform)
1150 {
1151 struct panfrost_context *ctx = batch->ctx;
1152 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1153
1154 uniform->f[0] = sampl->min_lod;
1155 uniform->f[1] = sampl->max_lod;
1156 uniform->f[2] = sampl->lod_bias;
1157
1158 /* Even without any errata, Midgard represents "no mipmapping" as
1159 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1160 * panfrost_create_sampler_state which also explains our choice of
1161 * epsilon value (again to keep behaviour consistent) */
1162
1163 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1164 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1165 }
1166
1167 static void
1168 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1169 struct sysval_uniform *uniform)
1170 {
1171 struct panfrost_context *ctx = batch->ctx;
1172
1173 uniform->u[0] = ctx->compute_grid->grid[0];
1174 uniform->u[1] = ctx->compute_grid->grid[1];
1175 uniform->u[2] = ctx->compute_grid->grid[2];
1176 }
1177
1178 static void
1179 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1180 struct panfrost_shader_state *ss,
1181 enum pipe_shader_type st)
1182 {
1183 struct sysval_uniform *uniforms = (void *)buf;
1184
1185 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1186 int sysval = ss->sysval[i];
1187
1188 switch (PAN_SYSVAL_TYPE(sysval)) {
1189 case PAN_SYSVAL_VIEWPORT_SCALE:
1190 panfrost_upload_viewport_scale_sysval(batch,
1191 &uniforms[i]);
1192 break;
1193 case PAN_SYSVAL_VIEWPORT_OFFSET:
1194 panfrost_upload_viewport_offset_sysval(batch,
1195 &uniforms[i]);
1196 break;
1197 case PAN_SYSVAL_TEXTURE_SIZE:
1198 panfrost_upload_txs_sysval(batch, st,
1199 PAN_SYSVAL_ID(sysval),
1200 &uniforms[i]);
1201 break;
1202 case PAN_SYSVAL_SSBO:
1203 panfrost_upload_ssbo_sysval(batch, st,
1204 PAN_SYSVAL_ID(sysval),
1205 &uniforms[i]);
1206 break;
1207 case PAN_SYSVAL_NUM_WORK_GROUPS:
1208 panfrost_upload_num_work_groups_sysval(batch,
1209 &uniforms[i]);
1210 break;
1211 case PAN_SYSVAL_SAMPLER:
1212 panfrost_upload_sampler_sysval(batch, st,
1213 PAN_SYSVAL_ID(sysval),
1214 &uniforms[i]);
1215 break;
1216 default:
1217 assert(0);
1218 }
1219 }
1220 }
1221
1222 static const void *
1223 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1224 unsigned index)
1225 {
1226 struct pipe_constant_buffer *cb = &buf->cb[index];
1227 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1228
1229 if (rsrc)
1230 return rsrc->bo->cpu;
1231 else if (cb->user_buffer)
1232 return cb->user_buffer;
1233 else
1234 unreachable("No constant buffer");
1235 }
1236
1237 void
1238 panfrost_emit_const_buf(struct panfrost_batch *batch,
1239 enum pipe_shader_type stage,
1240 struct mali_vertex_tiler_postfix *postfix)
1241 {
1242 struct panfrost_context *ctx = batch->ctx;
1243 struct panfrost_shader_variants *all = ctx->shader[stage];
1244
1245 if (!all)
1246 return;
1247
1248 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1249
1250 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1251
1252 /* Uniforms are implicitly UBO #0 */
1253 bool has_uniforms = buf->enabled_mask & (1 << 0);
1254
1255 /* Allocate room for the sysval and the uniforms */
1256 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1257 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1258 size_t size = sys_size + uniform_size;
1259 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1260 size);
1261
1262 /* Upload sysvals requested by the shader */
1263 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1264
1265 /* Upload uniforms */
1266 if (has_uniforms && uniform_size) {
1267 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1268 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1269 }
1270
1271 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1272 * uploaded */
1273
1274 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1275 assert(ubo_count >= 1);
1276
1277 size_t sz = sizeof(uint64_t) * ubo_count;
1278 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1279 int uniform_count = ss->uniform_count;
1280
1281 /* Upload uniforms as a UBO */
1282 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1283
1284 /* The rest are honest-to-goodness UBOs */
1285
1286 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1287 size_t usz = buf->cb[ubo].buffer_size;
1288 bool enabled = buf->enabled_mask & (1 << ubo);
1289 bool empty = usz == 0;
1290
1291 if (!enabled || empty) {
1292 /* Stub out disabled UBOs to catch accesses */
1293 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1294 continue;
1295 }
1296
1297 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1298 buf, ubo);
1299
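/* The UBO size is evidently encoded in 16-byte (vec4) fields, so round the
 * byte size up to a whole number of fields before packing the descriptor. */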
1300 unsigned bytes_per_field = 16;
1301 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1302 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1303 }
1304
1305 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1306 postfix->uniforms = transfer.gpu;
1307 postfix->uniform_buffers = ubufs;
1308
1309 buf->dirty_mask = 0;
1310 }
1311
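/* Emits the shared-memory descriptor for a compute job: per-workgroup shared
 * memory is rounded up to a power of two of at least 128 bytes, the backing
 * BO is sized by the grid dimensions (with what appears to be a 4x safety
 * factor), and the workgroup count and per-group size are expressed as log2
 * values for the hardware. */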
1312 void
1313 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1314 const struct pipe_grid_info *info,
1315 struct midgard_payload_vertex_tiler *vtp)
1316 {
1317 struct panfrost_context *ctx = batch->ctx;
1318 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1319 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1320 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1321 128));
1322 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1323 info->grid[2] * 4;
1324 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1325 shared_size,
1326 1);
1327
1328 struct mali_shared_memory shared = {
1329 .shared_memory = bo->gpu,
1330 .shared_workgroup_count =
1331 util_logbase2_ceil(info->grid[0]) +
1332 util_logbase2_ceil(info->grid[1]) +
1333 util_logbase2_ceil(info->grid[2]),
1334 .shared_unk1 = 0x2,
1335 .shared_shift = util_logbase2(single_size) - 1
1336 };
1337
1338 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1339 sizeof(shared));
1340 }
1341
1342 static mali_ptr
1343 panfrost_get_tex_desc(struct panfrost_batch *batch,
1344 enum pipe_shader_type st,
1345 struct panfrost_sampler_view *view)
1346 {
1347 if (!view)
1348 return (mali_ptr) 0;
1349
1350 struct pipe_sampler_view *pview = &view->base;
1351 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1352
1353 /* Add the BO to the job so it's retained until the job is done. */
1354
1355 panfrost_batch_add_bo(batch, rsrc->bo,
1356 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1357 panfrost_bo_access_for_stage(st));
1358
1359 panfrost_batch_add_bo(batch, view->midgard_bo,
1360 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1361 panfrost_bo_access_for_stage(st));
1362
1363 return view->midgard_bo->gpu;
1364 }
1365
1366 void
1367 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1368 enum pipe_shader_type stage,
1369 struct mali_vertex_tiler_postfix *postfix)
1370 {
1371 struct panfrost_context *ctx = batch->ctx;
1372 struct panfrost_device *device = pan_device(ctx->base.screen);
1373
1374 if (!ctx->sampler_view_count[stage])
1375 return;
1376
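/* Bifrost consumes an array of full texture descriptors copied inline into
 * transient memory, whereas Midgard consumes an array of 64-bit pointers
 * ("trampolines") to descriptors living in their own BOs. */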
1377 if (device->quirks & IS_BIFROST) {
1378 struct bifrost_texture_descriptor *descriptors;
1379
1380 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1381 ctx->sampler_view_count[stage]);
1382
1383 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1384 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1385 struct pipe_sampler_view *pview = &view->base;
1386 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1387
1388 /* Add the BOs to the job so they are retained until the job is done. */
1389
1390 panfrost_batch_add_bo(batch, rsrc->bo,
1391 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1392 panfrost_bo_access_for_stage(stage));
1393
1394 panfrost_batch_add_bo(batch, view->bifrost_bo,
1395 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1396 panfrost_bo_access_for_stage(stage));
1397
1398 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1399 }
1400
1401 postfix->textures = panfrost_upload_transient(batch,
1402 descriptors,
1403 sizeof(struct bifrost_texture_descriptor) *
1404 ctx->sampler_view_count[stage]);
1405
1406 free(descriptors);
1407 } else {
1408 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1409
1410 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1411 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1412 ctx->sampler_views[stage][i]);
1413
1414 postfix->textures = panfrost_upload_transient(batch,
1415 trampolines,
1416 sizeof(uint64_t) *
1417 ctx->sampler_view_count[stage]);
1418 }
1419 }
1420
1421 void
1422 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1423 enum pipe_shader_type stage,
1424 struct mali_vertex_tiler_postfix *postfix)
1425 {
1426 struct panfrost_context *ctx = batch->ctx;
1427 struct panfrost_device *device = pan_device(ctx->base.screen);
1428
1429 if (!ctx->sampler_count[stage])
1430 return;
1431
1432 if (device->quirks & IS_BIFROST) {
1433 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1434 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1435 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1436 transfer_size);
1437 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1438
1439 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1440 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1441
1442 postfix->sampler_descriptor = transfer.gpu;
1443 } else {
1444 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1445 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1446 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1447 transfer_size);
1448 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1449
1450 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1451 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1452
1453 postfix->sampler_descriptor = transfer.gpu;
1454 }
1455 }
1456
1457 void
1458 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1459 struct mali_vertex_tiler_postfix *vertex_postfix)
1460 {
1461 struct panfrost_context *ctx = batch->ctx;
1462
1463 if (!ctx->vertex)
1464 return;
1465
1466 struct panfrost_vertex_state *so = ctx->vertex;
1467
1468 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1469 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1470 sizeof(*so->hw) *
1471 PAN_MAX_ATTRIBUTE);
1472 }
1473
1474 void
1475 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1476 struct mali_vertex_tiler_postfix *vertex_postfix)
1477 {
1478 struct panfrost_context *ctx = batch->ctx;
1479 struct panfrost_vertex_state *so = ctx->vertex;
1480
1481 /* Staged mali_attr, and index into them. i =/= k, depending on the
1482 * vertex buffer mask and instancing. Twice as much room is allocated,
1483 * for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1484 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1485 unsigned k = 0;
1486
1487 for (unsigned i = 0; i < so->num_elements; ++i) {
1488 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1489 * means duplicating some vertex buffers (who cares? aside from
1490 * maybe some caching implications but I somehow doubt that
1491 * matters) */
1492
1493 struct pipe_vertex_element *elem = &so->pipe[i];
1494 unsigned vbi = elem->vertex_buffer_index;
1495
1496 /* The exception to 1:1 mapping is that we can have multiple
1497 * entries (NPOT divisors), so we fixup anyways */
1498
1499 so->hw[i].index = k;
1500
1501 if (!(ctx->vb_mask & (1 << vbi)))
1502 continue;
1503
1504 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1505 struct panfrost_resource *rsrc;
1506
1507 rsrc = pan_resource(buf->buffer.resource);
1508 if (!rsrc)
1509 continue;
1510
1511 /* Align to 64 bytes by masking off the lower bits. This
1512 * will be adjusted back when we fixup the src_offset in
1513 * mali_attr_meta */
1514
1515 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1516 mali_ptr addr = raw_addr & ~63;
1517 unsigned chopped_addr = raw_addr - addr;
1518
1519 /* Add a dependency of the batch on the vertex buffer */
1520 panfrost_batch_add_bo(batch, rsrc->bo,
1521 PAN_BO_ACCESS_SHARED |
1522 PAN_BO_ACCESS_READ |
1523 PAN_BO_ACCESS_VERTEX_TILER);
1524
1525 /* Set common fields */
1526 attrs[k].elements = addr;
1527 attrs[k].stride = buf->stride;
1528
1529 /* Since we advanced the base pointer, we shrink the buffer
1530 * size */
1531 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1532
1533 /* We need to add the extra size we masked off (for
1534 * correctness) so the data doesn't get clamped away */
1535 attrs[k].size += chopped_addr;
1536
1537 /* For non-instancing make sure we initialize */
1538 attrs[k].shift = attrs[k].extra_flags = 0;
1539
1540 /* Instancing uses a dramatically different code path than
1541 * linear, so dispatch for the actual emission now that the
1542 * common code is finished */
1543
1544 unsigned divisor = elem->instance_divisor;
1545
1546 if (divisor && ctx->instance_count == 1) {
1547 /* Silly corner case where there's a divisor(=1) but
1548 * there's no legitimate instancing. So we want *every*
1549 * attribute to be the same. So set stride to zero so
1550 * we don't go anywhere. */
1551
1552 attrs[k].size = attrs[k].stride + chopped_addr;
1553 attrs[k].stride = 0;
1554 attrs[k++].elements |= MALI_ATTR_LINEAR;
1555 } else if (ctx->instance_count <= 1) {
1556 /* Normal, non-instanced attributes */
1557 attrs[k++].elements |= MALI_ATTR_LINEAR;
1558 } else {
1559 unsigned instance_shift = vertex_postfix->instance_shift;
1560 unsigned instance_odd = vertex_postfix->instance_odd;
1561
1562 k += panfrost_vertex_instanced(ctx->padded_count,
1563 instance_shift,
1564 instance_odd,
1565 divisor, &attrs[k]);
1566 }
1567 }
1568
1569 /* Add special gl_VertexID/gl_InstanceID buffers */
1570
1571 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1572 so->hw[PAN_VERTEX_ID].index = k++;
1573 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1574 so->hw[PAN_INSTANCE_ID].index = k++;
1575
1576 /* Upload whatever we emitted and go */
1577
1578 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1579 k * sizeof(*attrs));
1580 }
1581
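/* Allocates transient storage for a linearly-addressed varying buffer of
 * stride * count bytes, points the mali_attr slot at it, and returns the GPU
 * address so callers can also reference the buffer directly (e.g. for the
 * position varying). */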
1582 static mali_ptr
1583 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1584 unsigned stride, unsigned count)
1585 {
1586 /* Fill out the descriptor */
1587 slot->stride = stride;
1588 slot->size = stride * count;
1589 slot->shift = slot->extra_flags = 0;
1590
1591 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1592 slot->size);
1593
1594 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1595
1596 return transfer.gpu;
1597 }
1598
1599 static void
1600 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1601 unsigned stride, unsigned offset, unsigned count,
1602 struct pipe_stream_output_target *target)
1603 {
1604 /* Fill out the descriptor */
1605 slot->stride = stride * 4;
1606 slot->shift = slot->extra_flags = 0;
1607
1608 unsigned max_size = target->buffer_size;
1609 unsigned expected_size = slot->stride * count;
1610
1611 slot->size = MIN2(max_size, expected_size);
1612
1613 /* Grab the BO and bind it to the batch */
1614 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1615
1616 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1617 * the perspective of the TILER and FRAGMENT.
1618 */
1619 panfrost_batch_add_bo(batch, bo,
1620 PAN_BO_ACCESS_SHARED |
1621 PAN_BO_ACCESS_RW |
1622 PAN_BO_ACCESS_VERTEX_TILER |
1623 PAN_BO_ACCESS_FRAGMENT);
1624
1625 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1626 slot->elements = addr;
1627 }
1628
1629 /* Given a shader and buffer indices, link varying metadata together */
1630
1631 static bool
1632 is_special_varying(gl_varying_slot loc)
1633 {
1634 switch (loc) {
1635 case VARYING_SLOT_POS:
1636 case VARYING_SLOT_PSIZ:
1637 case VARYING_SLOT_PNTC:
1638 case VARYING_SLOT_FACE:
1639 return true;
1640 default:
1641 return false;
1642 }
1643 }
1644
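/* Points each varying record at the buffer index it should source from: the
 * special slots (gl_Position, gl_PointSize, gl_PointCoord, gl_FrontFacing)
 * get dedicated buffers, and everything else falls back to the shared
 * general-purpose varying buffer. */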
1645 static void
1646 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1647 signed general, signed gl_Position,
1648 signed gl_PointSize, signed gl_PointCoord,
1649 signed gl_FrontFacing)
1650 {
1651 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1652
1653 for (unsigned i = 0; i < ss->varying_count; ++i) {
1654 gl_varying_slot location = ss->varyings_loc[i];
1655 int index = -1;
1656
1657 switch (location) {
1658 case VARYING_SLOT_POS:
1659 index = gl_Position;
1660 break;
1661 case VARYING_SLOT_PSIZ:
1662 index = gl_PointSize;
1663 break;
1664 case VARYING_SLOT_PNTC:
1665 index = gl_PointCoord;
1666 break;
1667 case VARYING_SLOT_FACE:
1668 index = gl_FrontFacing;
1669 break;
1670 default:
1671 index = general;
1672 break;
1673 }
1674
1675 assert(index >= 0);
1676 out[i].index = index;
1677 }
1678 }
1679
1680 static bool
1681 has_point_coord(unsigned mask, gl_varying_slot loc)
1682 {
1683 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1684 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1685 else if (loc == VARYING_SLOT_PNTC)
1686 return (mask & (1 << 8));
1687 else
1688 return false;
1689 }
1690
1691 /* Helpers for manipulating stream out information so we can pack varyings
1692 * accordingly. Compute the src_offset for a given captured varying */
1693
1694 static struct pipe_stream_output *
1695 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1696 {
1697 for (unsigned i = 0; i < info->num_outputs; ++i) {
1698 if (info->output[i].register_index == loc)
1699 return &info->output[i];
1700 }
1701
1702 unreachable("Varying not captured");
1703 }
1704
1705 void
1706 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1707 unsigned vertex_count,
1708 struct mali_vertex_tiler_postfix *vertex_postfix,
1709 struct mali_vertex_tiler_postfix *tiler_postfix,
1710 union midgard_primitive_size *primitive_size)
1711 {
1712 /* Load the shaders */
1713 struct panfrost_context *ctx = batch->ctx;
1714 struct panfrost_shader_state *vs, *fs;
1715 unsigned int num_gen_varyings = 0;
1716 size_t vs_size, fs_size;
1717
1718 /* Allocate the varying descriptor */
1719
1720 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1721 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1722 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1723 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1724
1725 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1726 vs_size +
1727 fs_size);
1728
1729 struct pipe_stream_output_info *so = &vs->stream_output;
1730
1731 /* Check if this varying is linked by us. This is the case for
1732 * general-purpose, non-captured varyings. If it is, link it. If it's
1733 * not, use the provided stream out information to determine the
1734 * offset, since it was already linked for us. */
1735
1736 for (unsigned i = 0; i < vs->varying_count; i++) {
1737 gl_varying_slot loc = vs->varyings_loc[i];
1738
1739 bool special = is_special_varying(loc);
1740 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1741
1742 if (captured) {
1743 struct pipe_stream_output *o = pan_get_so(so, loc);
1744
1745 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1746 vs->varyings[i].src_offset = dst_offset;
1747 } else if (!special) {
1748 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1749 }
1750 }
1751
1752 /* Conversely, we need to set src_offset for the captured varyings.
1753 * Here, the layout is defined by the stream out info, not us */
1754
1755 /* Link up with fragment varyings */
1756 bool reads_point_coord = fs->reads_point_coord;
1757
1758 for (unsigned i = 0; i < fs->varying_count; i++) {
1759 gl_varying_slot loc = fs->varyings_loc[i];
1760 unsigned src_offset;
1761 signed vs_idx = -1;
1762
1763 /* Link up */
1764 for (unsigned j = 0; j < vs->varying_count; ++j) {
1765 if (vs->varyings_loc[j] == loc) {
1766 vs_idx = j;
1767 break;
1768 }
1769 }
1770
1771 /* Either assign or reuse */
1772 if (vs_idx >= 0)
1773 src_offset = vs->varyings[vs_idx].src_offset;
1774 else
1775 src_offset = 16 * (num_gen_varyings++);
1776
1777 fs->varyings[i].src_offset = src_offset;
1778
1779 if (has_point_coord(fs->point_sprite_mask, loc))
1780 reads_point_coord = true;
1781 }
1782
1783 memcpy(trans.cpu, vs->varyings, vs_size);
1784 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1785
1786 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1787
1788 /* Figure out how many streamout buffers could be bound */
1789 unsigned so_count = ctx->streamout.num_targets;
1790 for (unsigned i = 0; i < vs->varying_count; i++) {
1791 gl_varying_slot loc = vs->varyings_loc[i];
1792
1793 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1794 if (!captured) continue;
1795
1796 struct pipe_stream_output *o = pan_get_so(so, loc);
1797 so_count = MAX2(so_count, o->output_buffer + 1);
1798 }
1799
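/* Allocate buffer indices: stream out buffers first, then the general
 * varying buffer, then one slot per special (gl_*) varying actually used */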
1800 signed idx = so_count;
1801 signed general = idx++;
1802 signed gl_Position = idx++;
1803 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1804 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1805 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1806 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1807
1808 /* Emit the stream out buffers */
1809
1810 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1811 ctx->vertex_count);
1812
1813 for (unsigned i = 0; i < so_count; ++i) {
1814 if (i < ctx->streamout.num_targets) {
1815 panfrost_emit_streamout(batch, &varyings[i],
1816 so->stride[i],
1817 ctx->streamout.offsets[i],
1818 out_count,
1819 ctx->streamout.targets[i]);
1820 } else {
1821 /* Emit a dummy buffer */
1822 panfrost_emit_varyings(batch, &varyings[i],
1823 so->stride[i] * 4,
1824 out_count);
1825
1826 /* Clear the attribute type */
1827 varyings[i].elements &= ~0xF;
1828 }
1829 }
1830
1831 panfrost_emit_varyings(batch, &varyings[general],
1832 num_gen_varyings * 16,
1833 vertex_count);
1834
1835 mali_ptr varyings_p;
1836
1837 /* fp32 vec4 gl_Position */
1838 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1839 sizeof(float) * 4, vertex_count);
1840 tiler_postfix->position_varying = varyings_p;
1841
1842
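/* gl_PointSize: 2 bytes per vertex (presumably fp16), only if written */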
1843 if (panfrost_writes_point_size(ctx)) {
1844 varyings_p = panfrost_emit_varyings(batch,
1845 &varyings[gl_PointSize],
1846 2, vertex_count);
1847 primitive_size->pointer = varyings_p;
1848 }
1849
1850 if (reads_point_coord)
1851 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1852
1853 if (fs->reads_face)
1854 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1855
1856 if (fs->reads_frag_coord)
1857 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1858
1859 struct panfrost_device *device = pan_device(ctx->base.screen);
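/* Reading gl_PointCoord is not handled on Bifrost in this path */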
1860 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1861
1862 /* Let's go ahead and link varying meta to the buffer in question, now
1863 * that this information is available. VARYING_SLOT_POS is mapped to
1864 * gl_FragCoord for fragment shaders but gl_Position for vertex
1865 * shaders. */
1866
1867 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1868 gl_PointSize, gl_PointCoord,
1869 gl_FrontFacing);
1870
1871 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1872 gl_FragCoord, gl_PointSize,
1873 gl_PointCoord, gl_FrontFacing);
1874
1875 /* Replace streamout: point captured varyings' meta at the stream out buffers */
1876
1877 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1878 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1879
1880 for (unsigned i = 0; i < vs->varying_count; i++) {
1881 gl_varying_slot loc = vs->varyings_loc[i];
1882
1883 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1884 if (!captured)
1885 continue;
1886
1887 struct pipe_stream_output *o = pan_get_so(so, loc);
1888 ovs[i].index = o->output_buffer;
1889
1890 assert(o->stream == 0);
1891 ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
1892 | MALI_NR_CHANNELS(o->num_components);
1893
1894 if (device->quirks & HAS_SWIZZLES)
1895 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1896 else
1897 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1898
1899 /* Link to the fragment */
1900 signed fs_idx = -1;
1901
1902 /* Link up */
1903 for (unsigned j = 0; j < fs->varying_count; ++j) {
1904 if (fs->varyings_loc[j] == loc) {
1905 fs_idx = j;
1906 break;
1907 }
1908 }
1909
1910 if (fs_idx >= 0) {
1911 ofs[fs_idx].index = ovs[i].index;
1912 ofs[fs_idx].format = ovs[i].format;
1913 ofs[fs_idx].swizzle = ovs[i].swizzle;
1914 }
1915 }
1916
1917 /* Replace point sprite: route replaced texture coordinates to gl_PointCoord */
1918 for (unsigned i = 0; i < fs->varying_count; i++) {
1919 /* If we have a point sprite replacement, handle that here. We
1920 * have to translate the location first. TODO: Flip Y in the shader
1921 * instead; we already key the shader, this is just a time crunch */
1922
1923 if (has_point_coord(fs->point_sprite_mask,
1924 fs->varyings_loc[i])) {
1925 ofs[i].index = gl_PointCoord;
1926
1927 /* Swizzle out the z/w to 0/1 */
1928 ofs[i].format = MALI_RG16F;
1929 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1930 }
1931 }
1932
1933 /* Fix up unaligned addresses */
1934 for (unsigned i = 0; i < so_count; ++i) {
1935 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1936 continue;
1937
1938 unsigned align = (varyings[i].elements & 63);
1939
1940 /* SO buffers are linear, so set that bit even when already aligned */
1941
1942 if (!align) {
1943 varyings[i].elements |= MALI_ATTR_LINEAR;
1944 continue;
1945 }
1946
1947 /* We need to adjust alignment */
1948 varyings[i].elements &= ~63;
1949 varyings[i].elements |= MALI_ATTR_LINEAR;
1950 varyings[i].size += align;
1951
1952 for (unsigned v = 0; v < vs->varying_count; ++v) {
1953 if (ovs[v].index != i)
1954 continue;
1955
1956 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1957 }
1958
1959 for (unsigned f = 0; f < fs->varying_count; ++f) {
1960 if (ofs[f].index != i)
1961 continue;
1962
1963 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1964 }
1965 }
1966
1967 varyings_p = panfrost_upload_transient(batch, varyings,
1968 idx * sizeof(*varyings));
1969 vertex_postfix->varyings = varyings_p;
1970 tiler_postfix->varyings = varyings_p;
1971
1972 vertex_postfix->varying_meta = trans.gpu;
1973 tiler_postfix->varying_meta = trans.gpu + vs_size;
1974 }
1975
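/* Build the vertex and tiler payloads (Bifrost or Midgard layout as
 * appropriate) and queue them as jobs on the batch */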
1976 void
1977 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1978 struct mali_vertex_tiler_prefix *vertex_prefix,
1979 struct mali_vertex_tiler_postfix *vertex_postfix,
1980 struct mali_vertex_tiler_prefix *tiler_prefix,
1981 struct mali_vertex_tiler_postfix *tiler_postfix,
1982 union midgard_primitive_size *primitive_size)
1983 {
1984 struct panfrost_context *ctx = batch->ctx;
1985 struct panfrost_device *device = pan_device(ctx->base.screen);
1986 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1987 struct bifrost_payload_vertex bifrost_vertex = {0,};
1988 struct bifrost_payload_tiler bifrost_tiler = {0,};
1989 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1990 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1991 void *vp, *tp;
1992 size_t vp_size, tp_size;
1993
1994 if (device->quirks & IS_BIFROST) {
1995 bifrost_vertex.prefix = *vertex_prefix;
1996 bifrost_vertex.postfix = *vertex_postfix;
1997 vp = &bifrost_vertex;
1998 vp_size = sizeof(bifrost_vertex);
1999
2000 bifrost_tiler.prefix = *tiler_prefix;
2001 bifrost_tiler.tiler.primitive_size = *primitive_size;
2002 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2003 bifrost_tiler.postfix = *tiler_postfix;
2004 tp = &bifrost_tiler;
2005 tp_size = sizeof(bifrost_tiler);
2006 } else {
2007 midgard_vertex.prefix = *vertex_prefix;
2008 midgard_vertex.postfix = *vertex_postfix;
2009 vp = &midgard_vertex;
2010 vp_size = sizeof(midgard_vertex);
2011
2012 midgard_tiler.prefix = *tiler_prefix;
2013 midgard_tiler.postfix = *tiler_postfix;
2014 midgard_tiler.primitive_size = *primitive_size;
2015 tp = &midgard_tiler;
2016 tp_size = sizeof(midgard_tiler);
2017 }
2018
2019 if (wallpapering) {
2020 /* Inject in reverse order, with "predicted" job indices.
2021 * THIS IS A HACK XXX */
2022 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2023 batch->job_index + 2, tp, tp_size, true);
2024 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2025 vp, vp_size, true);
2026 return;
2027 }
2028
2029 /* If rasterizer discard is enabled, only submit the vertex job */
2030
2031 bool rasterizer_discard = ctx->rasterizer &&
2032 ctx->rasterizer->base.rasterizer_discard;
2033
2034 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2035 vp, vp_size, false);
2036
2037 if (rasterizer_discard)
2038 return;
2039
2040 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2041 false);
2042 }
2043
2044 /* TODO: stop hardcoding this */
2045 mali_ptr
2046 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2047 {
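/* 48 (x, y) sample position pairs; presumably fixed point with 128 at the
 * pixel centre, but the exact layout is not understood (hence the TODO) */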
2048 uint16_t locations[] = {
2049 128, 128,
2050 0, 256,
2051 0, 256,
2052 0, 256,
2053 0, 256,
2054 0, 256,
2055 0, 256,
2056 0, 256,
2057 0, 256,
2058 0, 256,
2059 0, 256,
2060 0, 256,
2061 0, 256,
2062 0, 256,
2063 0, 256,
2064 0, 256,
2065 0, 256,
2066 0, 256,
2067 0, 256,
2068 0, 256,
2069 0, 256,
2070 0, 256,
2071 0, 256,
2072 0, 256,
2073 0, 256,
2074 0, 256,
2075 0, 256,
2076 0, 256,
2077 0, 256,
2078 0, 256,
2079 0, 256,
2080 0, 256,
2081 128, 128,
2082 0, 0,
2083 0, 0,
2084 0, 0,
2085 0, 0,
2086 0, 0,
2087 0, 0,
2088 0, 0,
2089 0, 0,
2090 0, 0,
2091 0, 0,
2092 0, 0,
2093 0, 0,
2094 0, 0,
2095 0, 0,
2096 0, 0,
2097 };
2098
2099 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2100 }