panfrost: Preload gl_FragCoord on Bifrost
[mesa.git] src/gallium/drivers/panfrost/pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77         /* If we haven't already, reserve space for the framebuffer descriptor */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86                         /* Tag the pointer so it is recognized as an MFBD */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
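            /* On Bifrost this slot gets a mali_shared_memory descriptor
             * (stack/scratchpad); on Midgard the same slot is reused for the
             * tagged framebuffer descriptor, hence the two helpers below. */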
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190  * good for the duration of the draw (transient), though it could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267         /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
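                    /* Only indices in [min_index, max_index] are shaded, so
                     * rebase the index values by -min_index (presumably so
                     * they line up with the offset_start used for fetching). */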
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
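                    /* The padded count is encoded for the hardware as
                     * (2 * instance_odd + 1) << instance_shift; the ctz/shift
                     * below recovers those two fields. */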
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
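            /* The low bits of the shader address carry ss->first_tag (on
             * Midgard, the tag of the first instruction bundle). */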
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x950020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else {
342 meta->bifrost2.preload_regs = 0x1;
343 SET_BIT(meta->bifrost2.preload_regs, 0x10, ss->reads_frag_coord);
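                            /* Preloading the fragment position when the shader
                             * reads gl_FragCoord means no varying slot is
                             * needed for it on Bifrost (see
                             * panfrost_emit_varying_descriptor). */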
344 }
345
346 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 } else {
349 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
350 ss->uniform_cutoff);
351 meta->midgard1.work_count = ss->work_reg_count;
352
353 /* TODO: This is not conformant on ES3 */
354 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
355
356 meta->midgard1.flags_lo = 0x220;
357 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
358 }
359 }
360
361 static unsigned
362 panfrost_translate_compare_func(enum pipe_compare_func in)
363 {
364 switch (in) {
365 case PIPE_FUNC_NEVER:
366 return MALI_FUNC_NEVER;
367
368 case PIPE_FUNC_LESS:
369 return MALI_FUNC_LESS;
370
371 case PIPE_FUNC_EQUAL:
372 return MALI_FUNC_EQUAL;
373
374 case PIPE_FUNC_LEQUAL:
375 return MALI_FUNC_LEQUAL;
376
377 case PIPE_FUNC_GREATER:
378 return MALI_FUNC_GREATER;
379
380 case PIPE_FUNC_NOTEQUAL:
381 return MALI_FUNC_NOTEQUAL;
382
383 case PIPE_FUNC_GEQUAL:
384 return MALI_FUNC_GEQUAL;
385
386 case PIPE_FUNC_ALWAYS:
387 return MALI_FUNC_ALWAYS;
388
389 default:
390 unreachable("Invalid func");
391 }
392 }
393
394 static unsigned
395 panfrost_translate_stencil_op(enum pipe_stencil_op in)
396 {
397 switch (in) {
398 case PIPE_STENCIL_OP_KEEP:
399 return MALI_STENCIL_KEEP;
400
401 case PIPE_STENCIL_OP_ZERO:
402 return MALI_STENCIL_ZERO;
403
404 case PIPE_STENCIL_OP_REPLACE:
405 return MALI_STENCIL_REPLACE;
406
407 case PIPE_STENCIL_OP_INCR:
408 return MALI_STENCIL_INCR;
409
410 case PIPE_STENCIL_OP_DECR:
411 return MALI_STENCIL_DECR;
412
413 case PIPE_STENCIL_OP_INCR_WRAP:
414 return MALI_STENCIL_INCR_WRAP;
415
416 case PIPE_STENCIL_OP_DECR_WRAP:
417 return MALI_STENCIL_DECR_WRAP;
418
419 case PIPE_STENCIL_OP_INVERT:
420 return MALI_STENCIL_INVERT;
421
422 default:
423 unreachable("Invalid stencil op");
424 }
425 }
426
427 static unsigned
428 translate_tex_wrap(enum pipe_tex_wrap w)
429 {
430 switch (w) {
431 case PIPE_TEX_WRAP_REPEAT:
432 return MALI_WRAP_REPEAT;
433
434 case PIPE_TEX_WRAP_CLAMP:
435 return MALI_WRAP_CLAMP;
436
437 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
438 return MALI_WRAP_CLAMP_TO_EDGE;
439
440 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
441 return MALI_WRAP_CLAMP_TO_BORDER;
442
443 case PIPE_TEX_WRAP_MIRROR_REPEAT:
444 return MALI_WRAP_MIRRORED_REPEAT;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP:
447 return MALI_WRAP_MIRRORED_CLAMP;
448
449 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
450 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
451
452 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
453 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
454
455 default:
456 unreachable("Invalid wrap");
457 }
458 }
459
460 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
461 struct mali_sampler_descriptor *hw)
462 {
463 unsigned func = panfrost_translate_compare_func(cso->compare_func);
464 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
465 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
466 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
467 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
468 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
469 unsigned mip_filter = mip_linear ?
470 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
471 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
472
473 *hw = (struct mali_sampler_descriptor) {
474 .filter_mode = min_filter | mag_filter | mip_filter |
475 normalized,
476 .wrap_s = translate_tex_wrap(cso->wrap_s),
477 .wrap_t = translate_tex_wrap(cso->wrap_t),
478 .wrap_r = translate_tex_wrap(cso->wrap_r),
479 .compare_func = panfrost_flip_compare_func(func),
480 .border_color = {
481 cso->border_color.f[0],
482 cso->border_color.f[1],
483 cso->border_color.f[2],
484 cso->border_color.f[3]
485 },
486 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
487 .max_lod = FIXED_16(cso->max_lod, false),
488 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
489 .seamless_cube_map = cso->seamless_cube_map,
490 };
491
492 /* If necessary, we disable mipmapping in the sampler descriptor by
493 * clamping the LOD as tight as possible (from 0 to epsilon,
494 * essentially -- remember these are fixed point numbers, so
495 * epsilon=1/256) */
496
497 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
498 hw->max_lod = hw->min_lod + 1;
499 }
500
501 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
502 struct bifrost_sampler_descriptor *hw)
503 {
504 *hw = (struct bifrost_sampler_descriptor) {
505 .unk1 = 0x1,
506 .wrap_s = translate_tex_wrap(cso->wrap_s),
507 .wrap_t = translate_tex_wrap(cso->wrap_t),
508 .wrap_r = translate_tex_wrap(cso->wrap_r),
509 .unk8 = 0x8,
510 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
511 .norm_coords = cso->normalized_coords,
512 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
513 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
514 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
515 .max_lod = FIXED_16(cso->max_lod, false),
516 };
517
518 /* If necessary, we disable mipmapping in the sampler descriptor by
519 * clamping the LOD as tight as possible (from 0 to epsilon,
520 * essentially -- remember these are fixed point numbers, so
521 * epsilon=1/256) */
522
523 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
524 hw->max_lod = hw->min_lod + 1;
525 }
526
527 static void
528 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
529 struct mali_stencil_test *out)
530 {
531 out->ref = 0; /* Gallium gets it from elsewhere */
532
533 out->mask = in->valuemask;
534 out->func = panfrost_translate_compare_func(in->func);
535 out->sfail = panfrost_translate_stencil_op(in->fail_op);
536 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
537 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
538 }
539
540 static void
541 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
542 struct mali_shader_meta *fragmeta)
543 {
544 if (!ctx->rasterizer) {
545 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
546 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
547 fragmeta->depth_units = 0.0f;
548 fragmeta->depth_factor = 0.0f;
549 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
550 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
551 return;
552 }
553
554 bool msaa = ctx->rasterizer->base.multisample;
555
556 /* TODO: Sample size */
557 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
558 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
559 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
560 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
561
562         /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
563
564 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
565 ctx->rasterizer->base.offset_tri);
566 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
567 ctx->rasterizer->base.offset_tri);
568 }
569
570 static void
571 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
572 struct mali_shader_meta *fragmeta)
573 {
574 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
575 int zfunc = PIPE_FUNC_ALWAYS;
576
577 if (!zsa) {
578 struct pipe_stencil_state default_stencil = {
579 .enabled = 0,
580 .func = PIPE_FUNC_ALWAYS,
581                         .fail_op = PIPE_STENCIL_OP_KEEP,
582                         .zfail_op = PIPE_STENCIL_OP_KEEP,
583                         .zpass_op = PIPE_STENCIL_OP_KEEP,
584 .writemask = 0xFF,
585 .valuemask = 0xFF
586 };
587
588 panfrost_make_stencil_state(&default_stencil,
589 &fragmeta->stencil_front);
590 fragmeta->stencil_mask_front = default_stencil.writemask;
591 fragmeta->stencil_back = fragmeta->stencil_front;
592 fragmeta->stencil_mask_back = default_stencil.writemask;
593 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
594 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
595 } else {
596 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
597 zsa->stencil[0].enabled);
598 panfrost_make_stencil_state(&zsa->stencil[0],
599 &fragmeta->stencil_front);
600 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
601 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
602
603 /* If back-stencil is not enabled, use the front values */
604
605 if (zsa->stencil[1].enabled) {
606 panfrost_make_stencil_state(&zsa->stencil[1],
607 &fragmeta->stencil_back);
608 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
609 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
610 } else {
611 fragmeta->stencil_back = fragmeta->stencil_front;
612 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
613 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
614 }
615
616 if (zsa->depth.enabled)
617 zfunc = zsa->depth.func;
618
619 /* Depth state (TODO: Refactor) */
620
621 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
622 zsa->depth.writemask);
623 }
624
625 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
626 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
627 }
628
629 static bool
630 panfrost_fs_required(
631 struct panfrost_shader_state *fs,
632 struct panfrost_blend_final *blend,
633 unsigned rt_count)
634 {
635 /* If we generally have side effects */
636 if (fs->fs_sidefx)
637 return true;
638
639 /* If colour is written we need to execute */
640 for (unsigned i = 0; i < rt_count; ++i) {
641 if (!blend[i].no_colour)
642 return true;
643 }
644
645 /* If depth is written and not implied we need to execute.
646 * TODO: Predicate on Z/S writes being enabled */
647 return (fs->writes_depth || fs->writes_stencil);
648 }
649
650 static void
651 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
652 struct mali_shader_meta *fragmeta,
653 void *rts)
654 {
655 const struct panfrost_device *dev = pan_device(ctx->base.screen);
656 struct panfrost_shader_state *fs;
657 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
658
659 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
660 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
661 !ctx->blend->base.dither);
662
663 /* Get blending setup */
664 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
665
666 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
667 unsigned shader_offset = 0;
668 struct panfrost_bo *shader_bo = NULL;
669
670 for (unsigned c = 0; c < rt_count; ++c)
671 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
672 &shader_offset);
673
674 /* Disable shader execution if we can */
675 if (dev->quirks & MIDGARD_SHADERLESS
676 && !panfrost_fs_required(fs, blend, rt_count)) {
677 fragmeta->shader = 0;
678 fragmeta->attribute_count = 0;
679 fragmeta->varying_count = 0;
680 fragmeta->texture_count = 0;
681 fragmeta->sampler_count = 0;
682
683 /* This feature is not known to work on Bifrost */
684 fragmeta->midgard1.work_count = 1;
685 fragmeta->midgard1.uniform_count = 0;
686 fragmeta->midgard1.uniform_buffer_count = 0;
687 }
688
689 /* If there is a blend shader, work registers are shared. We impose 8
690 * work registers as a limit for blend shaders. Should be lower XXX */
691
692 if (!(dev->quirks & IS_BIFROST)) {
693 for (unsigned c = 0; c < rt_count; ++c) {
694 if (blend[c].is_shader) {
695 fragmeta->midgard1.work_count =
696 MAX2(fragmeta->midgard1.work_count, 8);
697 }
698 }
699 }
700
701 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
702 * copied to the blend_meta appended (by convention), but this is the
703 * field actually read by the hardware. (Or maybe both are read...?).
704 * Specify the last RTi with a blend shader. */
705
706 fragmeta->blend.shader = 0;
707
708 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
709 if (!blend[rt].is_shader)
710 continue;
711
712 fragmeta->blend.shader = blend[rt].shader.gpu |
713 blend[rt].shader.first_tag;
714 break;
715 }
716
717 if (dev->quirks & MIDGARD_SFBD) {
718                 /* On platforms with only a single render target (SFBD), the blend
719                  * information is inside the shader meta itself. We additionally
720 * need to signal CAN_DISCARD for nontrivial blend modes (so
721 * we're able to read back the destination buffer) */
722
723 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
724 blend[0].is_shader);
725
726 if (!blend[0].is_shader) {
727 fragmeta->blend.equation = *blend[0].equation.equation;
728 fragmeta->blend.constant = blend[0].equation.constant;
729 }
730
731 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
732 !blend[0].no_blending || fs->can_discard);
733 return;
734 }
735
736 if (dev->quirks & IS_BIFROST) {
737 bool no_blend = true;
738
739 for (unsigned i = 0; i < rt_count; ++i)
740 no_blend &= (blend[i].no_blending | blend[i].no_colour);
741
742 SET_BIT(fragmeta->bifrost1.unk1, MALI_BIFROST_EARLY_Z,
743 !fs->can_discard && !fs->writes_depth && no_blend);
744 }
745
746 /* Additional blend descriptor tacked on for jobs using MFBD */
747
748 for (unsigned i = 0; i < rt_count; ++i) {
749 unsigned flags = 0;
750
751 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
752 flags = 0x200;
753
754 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
755 (ctx->pipe_framebuffer.cbufs[i]) &&
756 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
757
758 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
759 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
760 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
761 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
762 }
763
764 if (dev->quirks & IS_BIFROST) {
765 struct bifrost_blend_rt *brts = rts;
766
767 brts[i].flags = flags;
768
769 if (blend[i].is_shader) {
770 /* The blend shader's address needs to be at
771 * the same top 32 bit as the fragment shader.
772 * TODO: Ensure that's always the case.
773 */
774 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
775 (fs->bo->gpu & (0xffffffffull << 32)));
776 brts[i].shader = blend[i].shader.gpu;
777 brts[i].unk2 = 0x0;
778 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
779 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
780 const struct util_format_description *format_desc;
781 format_desc = util_format_description(format);
782
783 brts[i].equation = *blend[i].equation.equation;
784
785 /* TODO: this is a bit more complicated */
786 brts[i].constant = blend[i].equation.constant;
787
788 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
789
790 /* 0x19 disables blending and forces REPLACE
791 * mode (equivalent to rgb_mode = alpha_mode =
792                                          * 0x122, colour mask = 0xF). 0x1a allows
793 * blending. */
794 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
795
796 brts[i].shader_type = fs->blend_types[i];
797 } else {
798 /* Dummy attachment for depth-only */
799 brts[i].unk2 = 0x3;
800 brts[i].shader_type = fs->blend_types[i];
801 }
802 } else {
803 struct midgard_blend_rt *mrts = rts;
804 mrts[i].flags = flags;
805
806 if (blend[i].is_shader) {
807 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
808 } else {
809 mrts[i].blend.equation = *blend[i].equation.equation;
810 mrts[i].blend.constant = blend[i].equation.constant;
811 }
812 }
813 }
814 }
815
816 static void
817 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
818 struct mali_shader_meta *fragmeta,
819 void *rts)
820 {
821 const struct panfrost_device *dev = pan_device(ctx->base.screen);
822 struct panfrost_shader_state *fs;
823
824 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
825
826 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
827 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
828 fragmeta->unknown2_4 = 0x4e0;
829
830 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
831 * is required (independent of 32-bit/64-bit descriptors), or why it's
832 * not used on later GPU revisions. Otherwise, all shader jobs fault on
833 * these earlier chips (perhaps this is a chicken bit of some kind).
834 * More investigation is needed. */
835
836 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
837
838 if (dev->quirks & IS_BIFROST) {
839 /* TODO */
840 } else {
841                 /* Depending on whether it's legal to do so in the given shader, we try to
842 * enable early-z testing (or forward-pixel kill?) */
843
844 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
845 !fs->can_discard && !fs->writes_depth);
846
847 /* Add the writes Z/S flags if needed. */
848 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
849 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
850
851 /* Any time texturing is used, derivatives are implicitly calculated,
852 * so we need to enable helper invocations */
853
854 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
855 fs->helper_invocations);
856
857 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
858
859 bool depth_enabled = fs->writes_depth ||
860 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
861
862 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
863 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
864 }
865
866 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
867 panfrost_frag_meta_zsa_update(ctx, fragmeta);
868 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
869 }
870
871 void
872 panfrost_emit_shader_meta(struct panfrost_batch *batch,
873 enum pipe_shader_type st,
874 struct mali_vertex_tiler_postfix *postfix)
875 {
876 struct panfrost_context *ctx = batch->ctx;
877 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
878
879 if (!ss) {
880 postfix->shader = 0;
881 return;
882 }
883
884 struct mali_shader_meta meta;
885
886 panfrost_shader_meta_init(ctx, st, &meta);
887
888 /* Add the shader BO to the batch. */
889 panfrost_batch_add_bo(batch, ss->bo,
890 PAN_BO_ACCESS_PRIVATE |
891 PAN_BO_ACCESS_READ |
892 panfrost_bo_access_for_stage(st));
893
894 mali_ptr shader_ptr;
895
896 if (st == PIPE_SHADER_FRAGMENT) {
897 struct panfrost_device *dev = pan_device(ctx->base.screen);
898 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
899 size_t desc_size = sizeof(meta);
900 void *rts = NULL;
901 struct panfrost_transfer xfer;
902 unsigned rt_size;
903
904 if (dev->quirks & MIDGARD_SFBD)
905 rt_size = 0;
906 else if (dev->quirks & IS_BIFROST)
907 rt_size = sizeof(struct bifrost_blend_rt);
908 else
909 rt_size = sizeof(struct midgard_blend_rt);
910
911 desc_size += rt_size * rt_count;
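                    /* Per-RT blend descriptors are appended right after the
                     * shader descriptor in a single transient allocation (see
                     * the memcpy below). */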
912
913 if (rt_size)
914 rts = rzalloc_size(ctx, rt_size * rt_count);
915
916 panfrost_frag_shader_meta_init(ctx, &meta, rts);
917
918 xfer = panfrost_allocate_transient(batch, desc_size);
919
920 memcpy(xfer.cpu, &meta, sizeof(meta));
921 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
922
923 if (rt_size)
924 ralloc_free(rts);
925
926 shader_ptr = xfer.gpu;
927 } else {
928 shader_ptr = panfrost_upload_transient(batch, &meta,
929 sizeof(meta));
930 }
931
932 postfix->shader = shader_ptr;
933 }
934
935 static void
936 panfrost_mali_viewport_init(struct panfrost_context *ctx,
937 struct mali_viewport *mvp)
938 {
939 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
940
941 /* Clip bounds are encoded as floats. The viewport itself is encoded as
942 * (somewhat) asymmetric ints. */
943
944 const struct pipe_scissor_state *ss = &ctx->scissor;
945
946 memset(mvp, 0, sizeof(*mvp));
947
948 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
949 * each direction. Clipping to the viewport in theory should work, but
950 * in practice causes issues when we're not explicitly trying to
951 * scissor */
952
953 *mvp = (struct mali_viewport) {
954 .clip_minx = -INFINITY,
955 .clip_miny = -INFINITY,
956 .clip_maxx = INFINITY,
957 .clip_maxy = INFINITY,
958 };
959
960 /* Always scissor to the viewport by default. */
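            /* The viewport transform maps NDC [-1, 1] to translate +/- scale,
             * so translate -/+ |scale| gives the screen-space bounds. */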
961 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
962 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
963
964 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
965 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
966
967 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
968 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
969
970 /* Apply the scissor test */
971
972 unsigned minx, miny, maxx, maxy;
973
974 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
975 minx = MAX2(ss->minx, vp_minx);
976 miny = MAX2(ss->miny, vp_miny);
977 maxx = MIN2(ss->maxx, vp_maxx);
978 maxy = MIN2(ss->maxy, vp_maxy);
979 } else {
980 minx = vp_minx;
981 miny = vp_miny;
982 maxx = vp_maxx;
983 maxy = vp_maxy;
984 }
985
986 /* Hardware needs the min/max to be strictly ordered, so flip if we
987 * need to. The viewport transformation in the vertex shader will
988 * handle the negatives if we don't */
989
990 if (miny > maxy) {
991 unsigned temp = miny;
992 miny = maxy;
993 maxy = temp;
994 }
995
996 if (minx > maxx) {
997 unsigned temp = minx;
998 minx = maxx;
999 maxx = temp;
1000 }
1001
1002 if (minz > maxz) {
1003 float temp = minz;
1004 minz = maxz;
1005 maxz = temp;
1006 }
1007
1008 /* Clamp to the framebuffer size as a last check */
1009
1010 minx = MIN2(ctx->pipe_framebuffer.width, minx);
1011 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
1012
1013 miny = MIN2(ctx->pipe_framebuffer.height, miny);
1014 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
1015
1016 /* Upload */
1017
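            /* viewport1 stores inclusive maxima: MALI_POSITIVE subtracts one,
             * and panfrost_emit_viewport adds it back for the batch scissor. */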
1018 mvp->viewport0[0] = minx;
1019 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1020
1021 mvp->viewport0[1] = miny;
1022 mvp->viewport1[1] = MALI_POSITIVE(maxy);
1023
1024 mvp->clip_minz = minz;
1025 mvp->clip_maxz = maxz;
1026 }
1027
1028 void
1029 panfrost_emit_viewport(struct panfrost_batch *batch,
1030 struct mali_vertex_tiler_postfix *tiler_postfix)
1031 {
1032 struct panfrost_context *ctx = batch->ctx;
1033 struct mali_viewport mvp;
1034
1035 panfrost_mali_viewport_init(batch->ctx, &mvp);
1036
1037 /* Update the job, unless we're doing wallpapering (whose lack of
1038 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1039 * just... be faster :) */
1040
1041 if (!ctx->wallpaper_batch)
1042 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1043 mvp.viewport0[1],
1044 mvp.viewport1[0] + 1,
1045 mvp.viewport1[1] + 1);
1046
1047 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1048 sizeof(mvp));
1049 }
1050
1051 static mali_ptr
1052 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1053 enum pipe_shader_type st,
1054 struct panfrost_constant_buffer *buf,
1055 unsigned index)
1056 {
1057 struct pipe_constant_buffer *cb = &buf->cb[index];
1058 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1059
1060 if (rsrc) {
1061 panfrost_batch_add_bo(batch, rsrc->bo,
1062 PAN_BO_ACCESS_SHARED |
1063 PAN_BO_ACCESS_READ |
1064 panfrost_bo_access_for_stage(st));
1065
1066                 /* Alignment guaranteed by
1067 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1068 return rsrc->bo->gpu + cb->buffer_offset;
1069 } else if (cb->user_buffer) {
1070 return panfrost_upload_transient(batch,
1071 cb->user_buffer +
1072 cb->buffer_offset,
1073 cb->buffer_size);
1074 } else {
1075 unreachable("No constant buffer");
1076 }
1077 }
1078
1079 struct sysval_uniform {
1080 union {
1081 float f[4];
1082 int32_t i[4];
1083 uint32_t u[4];
1084 uint64_t du[2];
1085 };
1086 };
1087
1088 static void
1089 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1090 struct sysval_uniform *uniform)
1091 {
1092 struct panfrost_context *ctx = batch->ctx;
1093 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1094
1095 uniform->f[0] = vp->scale[0];
1096 uniform->f[1] = vp->scale[1];
1097 uniform->f[2] = vp->scale[2];
1098 }
1099
1100 static void
1101 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1102 struct sysval_uniform *uniform)
1103 {
1104 struct panfrost_context *ctx = batch->ctx;
1105 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1106
1107 uniform->f[0] = vp->translate[0];
1108 uniform->f[1] = vp->translate[1];
1109 uniform->f[2] = vp->translate[2];
1110 }
1111
1112 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1113 enum pipe_shader_type st,
1114 unsigned int sysvalid,
1115 struct sysval_uniform *uniform)
1116 {
1117 struct panfrost_context *ctx = batch->ctx;
1118 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1119 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1120 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1121 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1122
1123 assert(dim);
1124 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1125
1126 if (dim > 1)
1127 uniform->i[1] = u_minify(tex->texture->height0,
1128 tex->u.tex.first_level);
1129
1130 if (dim > 2)
1131 uniform->i[2] = u_minify(tex->texture->depth0,
1132 tex->u.tex.first_level);
1133
1134 if (is_array)
1135 uniform->i[dim] = tex->texture->array_size;
1136 }
1137
1138 static void
1139 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1140 enum pipe_shader_type st,
1141 unsigned ssbo_id,
1142 struct sysval_uniform *uniform)
1143 {
1144 struct panfrost_context *ctx = batch->ctx;
1145
1146 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1147 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1148
1149 /* Compute address */
1150 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1151
1152 panfrost_batch_add_bo(batch, bo,
1153 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1154 panfrost_bo_access_for_stage(st));
1155
1156 /* Upload address and size as sysval */
1157 uniform->du[0] = bo->gpu + sb.buffer_offset;
1158 uniform->u[2] = sb.buffer_size;
1159 }
1160
1161 static void
1162 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1163 enum pipe_shader_type st,
1164 unsigned samp_idx,
1165 struct sysval_uniform *uniform)
1166 {
1167 struct panfrost_context *ctx = batch->ctx;
1168 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1169
1170 uniform->f[0] = sampl->min_lod;
1171 uniform->f[1] = sampl->max_lod;
1172 uniform->f[2] = sampl->lod_bias;
1173
1174 /* Even without any errata, Midgard represents "no mipmapping" as
1175 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1176 * panfrost_create_sampler_state which also explains our choice of
1177 * epsilon value (again to keep behaviour consistent) */
1178
1179 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1180 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1181 }
1182
1183 static void
1184 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1185 struct sysval_uniform *uniform)
1186 {
1187 struct panfrost_context *ctx = batch->ctx;
1188
1189 uniform->u[0] = ctx->compute_grid->grid[0];
1190 uniform->u[1] = ctx->compute_grid->grid[1];
1191 uniform->u[2] = ctx->compute_grid->grid[2];
1192 }
1193
1194 static void
1195 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1196 struct panfrost_shader_state *ss,
1197 enum pipe_shader_type st)
1198 {
1199 struct sysval_uniform *uniforms = (void *)buf;
1200
1201 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1202 int sysval = ss->sysval[i];
1203
1204 switch (PAN_SYSVAL_TYPE(sysval)) {
1205 case PAN_SYSVAL_VIEWPORT_SCALE:
1206 panfrost_upload_viewport_scale_sysval(batch,
1207 &uniforms[i]);
1208 break;
1209 case PAN_SYSVAL_VIEWPORT_OFFSET:
1210 panfrost_upload_viewport_offset_sysval(batch,
1211 &uniforms[i]);
1212 break;
1213 case PAN_SYSVAL_TEXTURE_SIZE:
1214 panfrost_upload_txs_sysval(batch, st,
1215 PAN_SYSVAL_ID(sysval),
1216 &uniforms[i]);
1217 break;
1218 case PAN_SYSVAL_SSBO:
1219 panfrost_upload_ssbo_sysval(batch, st,
1220 PAN_SYSVAL_ID(sysval),
1221 &uniforms[i]);
1222 break;
1223 case PAN_SYSVAL_NUM_WORK_GROUPS:
1224 panfrost_upload_num_work_groups_sysval(batch,
1225 &uniforms[i]);
1226 break;
1227 case PAN_SYSVAL_SAMPLER:
1228 panfrost_upload_sampler_sysval(batch, st,
1229 PAN_SYSVAL_ID(sysval),
1230 &uniforms[i]);
1231 break;
1232 default:
1233 assert(0);
1234 }
1235 }
1236 }
1237
1238 static const void *
1239 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1240 unsigned index)
1241 {
1242 struct pipe_constant_buffer *cb = &buf->cb[index];
1243 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1244
1245 if (rsrc)
1246 return rsrc->bo->cpu;
1247 else if (cb->user_buffer)
1248 return cb->user_buffer;
1249 else
1250 unreachable("No constant buffer");
1251 }
1252
1253 void
1254 panfrost_emit_const_buf(struct panfrost_batch *batch,
1255 enum pipe_shader_type stage,
1256 struct mali_vertex_tiler_postfix *postfix)
1257 {
1258 struct panfrost_context *ctx = batch->ctx;
1259 struct panfrost_shader_variants *all = ctx->shader[stage];
1260
1261 if (!all)
1262 return;
1263
1264 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1265
1266 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1267
1268 /* Uniforms are implicitly UBO #0 */
1269 bool has_uniforms = buf->enabled_mask & (1 << 0);
1270
1271 /* Allocate room for the sysval and the uniforms */
1272 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1273 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1274 size_t size = sys_size + uniform_size;
1275 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1276 size);
1277
1278 /* Upload sysvals requested by the shader */
1279 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1280
1281 /* Upload uniforms */
1282 if (has_uniforms && uniform_size) {
1283 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1284 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1285 }
1286
1287 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1288 * uploaded */
1289
1290 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1291 assert(ubo_count >= 1);
1292
1293 size_t sz = sizeof(uint64_t) * ubo_count;
1294 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1295 int uniform_count = ss->uniform_count;
1296
1297 /* Upload uniforms as a UBO */
1298 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1299
1300 /* The rest are honest-to-goodness UBOs */
1301
1302 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1303 size_t usz = buf->cb[ubo].buffer_size;
1304 bool enabled = buf->enabled_mask & (1 << ubo);
1305 bool empty = usz == 0;
1306
1307 if (!enabled || empty) {
1308 /* Stub out disabled UBOs to catch accesses */
1309 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1310 continue;
1311 }
1312
1313 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1314 buf, ubo);
1315
1316 unsigned bytes_per_field = 16;
1317 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1318 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1319 }
1320
1321 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1322 postfix->uniforms = transfer.gpu;
1323 postfix->uniform_buffers = ubufs;
1324
1325 buf->dirty_mask = 0;
1326 }
1327
1328 void
1329 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1330 const struct pipe_grid_info *info,
1331 struct midgard_payload_vertex_tiler *vtp)
1332 {
1333 struct panfrost_context *ctx = batch->ctx;
1334 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1335 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1336 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1337 128));
1338 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1339 info->grid[2] * 4;
1340 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1341 shared_size,
1342 1);
1343
1344 struct mali_shared_memory shared = {
1345 .shared_memory = bo->gpu,
1346 .shared_workgroup_count =
1347 util_logbase2_ceil(info->grid[0]) +
1348 util_logbase2_ceil(info->grid[1]) +
1349 util_logbase2_ceil(info->grid[2]),
1350 .shared_unk1 = 0x2,
1351 .shared_shift = util_logbase2(single_size) - 1
1352 };
1353
1354 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1355 sizeof(shared));
1356 }
1357
1358 static mali_ptr
1359 panfrost_get_tex_desc(struct panfrost_batch *batch,
1360 enum pipe_shader_type st,
1361 struct panfrost_sampler_view *view)
1362 {
1363 if (!view)
1364 return (mali_ptr) 0;
1365
1366 struct pipe_sampler_view *pview = &view->base;
1367 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1368
1369 /* Add the BO to the job so it's retained until the job is done. */
1370
1371 panfrost_batch_add_bo(batch, rsrc->bo,
1372 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1373 panfrost_bo_access_for_stage(st));
1374
1375 panfrost_batch_add_bo(batch, view->midgard_bo,
1376 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1377 panfrost_bo_access_for_stage(st));
1378
1379 return view->midgard_bo->gpu;
1380 }
1381
1382 void
1383 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1384 enum pipe_shader_type stage,
1385 struct mali_vertex_tiler_postfix *postfix)
1386 {
1387 struct panfrost_context *ctx = batch->ctx;
1388 struct panfrost_device *device = pan_device(ctx->base.screen);
1389
1390 if (!ctx->sampler_view_count[stage])
1391 return;
1392
1393 if (device->quirks & IS_BIFROST) {
1394 struct bifrost_texture_descriptor *descriptors;
1395
1396 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1397 ctx->sampler_view_count[stage]);
1398
1399 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1400 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1401 struct pipe_sampler_view *pview = &view->base;
1402 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1403
1404 /* Add the BOs to the job so they are retained until the job is done. */
1405
1406 panfrost_batch_add_bo(batch, rsrc->bo,
1407 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1408 panfrost_bo_access_for_stage(stage));
1409
1410 panfrost_batch_add_bo(batch, view->bifrost_bo,
1411 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1412 panfrost_bo_access_for_stage(stage));
1413
1414 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1415 }
1416
1417 postfix->textures = panfrost_upload_transient(batch,
1418 descriptors,
1419 sizeof(struct bifrost_texture_descriptor) *
1420 ctx->sampler_view_count[stage]);
1421
1422 free(descriptors);
1423 } else {
1424 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1425
1426 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1427 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1428 ctx->sampler_views[stage][i]);
1429
1430 postfix->textures = panfrost_upload_transient(batch,
1431 trampolines,
1432 sizeof(uint64_t) *
1433 ctx->sampler_view_count[stage]);
1434 }
1435 }
1436
1437 void
1438 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1439 enum pipe_shader_type stage,
1440 struct mali_vertex_tiler_postfix *postfix)
1441 {
1442 struct panfrost_context *ctx = batch->ctx;
1443 struct panfrost_device *device = pan_device(ctx->base.screen);
1444
1445 if (!ctx->sampler_count[stage])
1446 return;
1447
1448 if (device->quirks & IS_BIFROST) {
1449 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1450 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1451 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1452 transfer_size);
1453 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1454
1455 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1456 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1457
1458 postfix->sampler_descriptor = transfer.gpu;
1459 } else {
1460 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1461 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1462 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1463 transfer_size);
1464 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1465
1466 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1467 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1468
1469 postfix->sampler_descriptor = transfer.gpu;
1470 }
1471 }
1472
1473 void
1474 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1475 struct mali_vertex_tiler_postfix *vertex_postfix)
1476 {
1477 struct panfrost_context *ctx = batch->ctx;
1478
1479 if (!ctx->vertex)
1480 return;
1481
1482 struct panfrost_vertex_state *so = ctx->vertex;
1483
1484 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1485 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1486 sizeof(*so->hw) *
1487 PAN_MAX_ATTRIBUTE);
1488 }
1489
1490 void
1491 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1492 struct mali_vertex_tiler_postfix *vertex_postfix)
1493 {
1494 struct panfrost_context *ctx = batch->ctx;
1495 struct panfrost_vertex_state *so = ctx->vertex;
1496
1497 /* Staged mali_attr, and index into them. i =/= k, depending on the
1498 * vertex buffer mask and instancing. Twice as much room is allocated,
1499          * for a worst case of NPOT_DIVIDEs which take up an extra slot */
1500 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1501 unsigned k = 0;
1502
1503 for (unsigned i = 0; i < so->num_elements; ++i) {
1504 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1505 * means duplicating some vertex buffers (who cares? aside from
1506 * maybe some caching implications but I somehow doubt that
1507 * matters) */
1508
1509 struct pipe_vertex_element *elem = &so->pipe[i];
1510 unsigned vbi = elem->vertex_buffer_index;
1511
1512 /* The exception to 1:1 mapping is that we can have multiple
1513                  * entries (NPOT divisors), so we fix up anyway */
1514
1515 so->hw[i].index = k;
1516
1517 if (!(ctx->vb_mask & (1 << vbi)))
1518 continue;
1519
1520 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1521 struct panfrost_resource *rsrc;
1522
1523 rsrc = pan_resource(buf->buffer.resource);
1524 if (!rsrc)
1525 continue;
1526
1527 /* Align to 64 bytes by masking off the lower bits. This
1528 * will be adjusted back when we fixup the src_offset in
1529 * mali_attr_meta */
1530
1531 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1532 mali_ptr addr = raw_addr & ~63;
1533 unsigned chopped_addr = raw_addr - addr;
1534
1535 /* Add a dependency of the batch on the vertex buffer */
1536 panfrost_batch_add_bo(batch, rsrc->bo,
1537 PAN_BO_ACCESS_SHARED |
1538 PAN_BO_ACCESS_READ |
1539 PAN_BO_ACCESS_VERTEX_TILER);
1540
1541 /* Set common fields */
1542 attrs[k].elements = addr;
1543 attrs[k].stride = buf->stride;
1544
1545 /* Since we advanced the base pointer, we shrink the buffer
1546 * size */
1547 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1548
1549 /* We need to add the extra size we masked off (for
1550 * correctness) so the data doesn't get clamped away */
1551 attrs[k].size += chopped_addr;
1552
1553 /* For non-instancing make sure we initialize */
1554 attrs[k].shift = attrs[k].extra_flags = 0;
1555
1556 /* Instancing uses a dramatically different code path than
1557 * linear, so dispatch for the actual emission now that the
1558 * common code is finished */
1559
1560 unsigned divisor = elem->instance_divisor;
1561
1562 if (divisor && ctx->instance_count == 1) {
1563 /* Silly corner case where there's a divisor(=1) but
1564 * there's no legitimate instancing. So we want *every*
1565 * attribute to be the same. So set stride to zero so
1566 * we don't go anywhere. */
1567
1568 attrs[k].size = attrs[k].stride + chopped_addr;
1569 attrs[k].stride = 0;
1570 attrs[k++].elements |= MALI_ATTR_LINEAR;
1571 } else if (ctx->instance_count <= 1) {
1572 /* Normal, non-instanced attributes */
1573 attrs[k++].elements |= MALI_ATTR_LINEAR;
1574 } else {
1575 unsigned instance_shift = vertex_postfix->instance_shift;
1576 unsigned instance_odd = vertex_postfix->instance_odd;
1577
1578 k += panfrost_vertex_instanced(ctx->padded_count,
1579 instance_shift,
1580 instance_odd,
1581 divisor, &attrs[k]);
1582 }
1583 }
1584
1585 /* Add special gl_VertexID/gl_InstanceID buffers */
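            /* These don't read a real buffer; panfrost_vertex_id() and
             * panfrost_instance_id() emit records that make the hardware
             * generate the index values itself (using padded_count). */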
1586
1587 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1588 so->hw[PAN_VERTEX_ID].index = k++;
1589 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1590 so->hw[PAN_INSTANCE_ID].index = k++;
1591
1592 /* Upload whatever we emitted and go */
1593
1594 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1595 k * sizeof(*attrs));
1596 }
1597
1598 static mali_ptr
1599 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1600 unsigned stride, unsigned count)
1601 {
1602 /* Fill out the descriptor */
1603 slot->stride = stride;
1604 slot->size = stride * count;
1605 slot->shift = slot->extra_flags = 0;
1606
1607 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1608 slot->size);
1609
1610 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1611
1612 return transfer.gpu;
1613 }
1614
1615 static void
1616 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1617 unsigned stride, unsigned offset, unsigned count,
1618 struct pipe_stream_output_target *target)
1619 {
1620 /* Fill out the descriptor */
1621 slot->stride = stride * 4;
1622 slot->shift = slot->extra_flags = 0;
1623
1624 unsigned max_size = target->buffer_size;
1625 unsigned expected_size = slot->stride * count;
1626
1627 slot->size = MIN2(max_size, expected_size);
1628
1629 /* Grab the BO and bind it to the batch */
1630 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1631
1632 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1633 * the perspective of the TILER and FRAGMENT.
1634 */
1635 panfrost_batch_add_bo(batch, bo,
1636 PAN_BO_ACCESS_SHARED |
1637 PAN_BO_ACCESS_RW |
1638 PAN_BO_ACCESS_VERTEX_TILER |
1639 PAN_BO_ACCESS_FRAGMENT);
1640
1641 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1642 slot->elements = addr;
1643 }
1644
1645 /* Given a shader and buffer indices, link varying metadata together */
1646
1647 static bool
1648 is_special_varying(gl_varying_slot loc)
1649 {
1650 switch (loc) {
1651 case VARYING_SLOT_POS:
1652 case VARYING_SLOT_PSIZ:
1653 case VARYING_SLOT_PNTC:
1654 case VARYING_SLOT_FACE:
1655 return true;
1656 default:
1657 return false;
1658 }
1659 }
1660
1661 static void
1662 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1663 signed general, signed gl_Position,
1664 signed gl_PointSize, signed gl_PointCoord,
1665 signed gl_FrontFacing)
1666 {
1667 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1668
1669 for (unsigned i = 0; i < ss->varying_count; ++i) {
1670 gl_varying_slot location = ss->varyings_loc[i];
1671 int index = -1;
1672
1673 switch (location) {
1674 case VARYING_SLOT_POS:
1675 index = gl_Position;
1676 break;
1677 case VARYING_SLOT_PSIZ:
1678 index = gl_PointSize;
1679 break;
1680 case VARYING_SLOT_PNTC:
1681 index = gl_PointCoord;
1682 break;
1683 case VARYING_SLOT_FACE:
1684 index = gl_FrontFacing;
1685 break;
1686 default:
1687 index = general;
1688 break;
1689 }
1690
1691 assert(index >= 0);
1692 out[i].index = index;
1693 }
1694 }
1695
1696 static bool
1697 has_point_coord(unsigned mask, gl_varying_slot loc)
1698 {
1699 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1700 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1701 else if (loc == VARYING_SLOT_PNTC)
1702 return (mask & (1 << 8));
1703 else
1704 return false;
1705 }
1706
1707 /* Helpers for manipulating stream out information so we can pack varyings
1708 * accordingly. Compute the src_offset for a given captured varying */
1709
1710 static struct pipe_stream_output *
1711 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1712 {
1713 for (unsigned i = 0; i < info->num_outputs; ++i) {
1714 if (info->output[i].register_index == loc)
1715 return &info->output[i];
1716 }
1717
1718 unreachable("Varying not captured");
1719 }
1720
1721 void
1722 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1723 unsigned vertex_count,
1724 struct mali_vertex_tiler_postfix *vertex_postfix,
1725 struct mali_vertex_tiler_postfix *tiler_postfix,
1726 union midgard_primitive_size *primitive_size)
1727 {
1728 /* Load the shaders */
1729 struct panfrost_context *ctx = batch->ctx;
1730 struct panfrost_device *device = pan_device(ctx->base.screen);
1731 struct panfrost_shader_state *vs, *fs;
1732 unsigned int num_gen_varyings = 0;
1733 size_t vs_size, fs_size;
1734
1735 /* Allocate the varying descriptor */
1736
1737 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1738 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1739 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1740 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1741
1742 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1743 vs_size +
1744 fs_size);
1745
1746 struct pipe_stream_output_info *so = &vs->stream_output;
1747
1748 /* Check if this varying is linked by us. This is the case for
1749 * general-purpose, non-captured varyings. If it is, link it. If it's
1750 * not, use the provided stream out information to determine the
1751 * offset, since it was already linked for us. */
1752
1753 for (unsigned i = 0; i < vs->varying_count; i++) {
1754 gl_varying_slot loc = vs->varyings_loc[i];
1755
1756 bool special = is_special_varying(loc);
1757 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1758
1759 if (captured) {
1760 struct pipe_stream_output *o = pan_get_so(so, loc);
1761
1762 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1763 vs->varyings[i].src_offset = dst_offset;
1764 } else if (!special) {
1765 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1766 }
1767 }
1768
1769 /* For the captured varyings above, src_offset came from the stream out
1770  * info: that layout is defined by the stream out state, not by us. */
1771
1772 /* Link up with fragment varyings */
1773 bool reads_point_coord = fs->reads_point_coord;
1774
1775 for (unsigned i = 0; i < fs->varying_count; i++) {
1776 gl_varying_slot loc = fs->varyings_loc[i];
1777 unsigned src_offset;
1778 signed vs_idx = -1;
1779
1780 /* Link up */
1781 for (unsigned j = 0; j < vs->varying_count; ++j) {
1782 if (vs->varyings_loc[j] == loc) {
1783 vs_idx = j;
1784 break;
1785 }
1786 }
1787
1788 /* Either assign or reuse */
1789 if (vs_idx >= 0)
1790 src_offset = vs->varyings[vs_idx].src_offset;
1791 else
1792 src_offset = 16 * (num_gen_varyings++);
1793
1794 fs->varyings[i].src_offset = src_offset;
1795
1796 if (has_point_coord(fs->point_sprite_mask, loc))
1797 reads_point_coord = true;
1798 }
1799
1800 memcpy(trans.cpu, vs->varyings, vs_size);
1801 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1802
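/* With the descriptors copied, build the buffer records (union mali_attr)
 * that those descriptors index into. */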
1803 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1804
1805 /* Figure out how many streamout buffers could be bound */
1806 unsigned so_count = ctx->streamout.num_targets;
1807 for (unsigned i = 0; i < vs->varying_count; i++) {
1808 gl_varying_slot loc = vs->varyings_loc[i];
1809
1810 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1811 if (!captured) continue;
1812
1813 struct pipe_stream_output *o = pan_get_so(so, loc);
1814 so_count = MAX2(so_count, o->output_buffer + 1);
1815 }
1816
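/* Assign buffer indices: transform feedback buffers occupy [0, so_count),
 * followed by the general varying buffer and then one buffer per special
 * varying that is actually used (-1 marks an unused special). Bifrost
 * preloads gl_FragCoord in the fragment shader, so it gets no buffer there. */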
1817 signed idx = so_count;
1818 signed general = idx++;
1819 signed gl_Position = idx++;
1820 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1821 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1822 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1823 signed gl_FragCoord = (fs->reads_frag_coord &&
1824 !(device->quirks & IS_BIFROST))
1825 ? (idx++) : -1;
1826
1827 /* Emit the stream out buffers */
1828
1829 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1830 ctx->vertex_count);
1831
1832 for (unsigned i = 0; i < so_count; ++i) {
1833 if (i < ctx->streamout.num_targets) {
1834 panfrost_emit_streamout(batch, &varyings[i],
1835 so->stride[i],
1836 ctx->streamout.offsets[i],
1837 out_count,
1838 ctx->streamout.targets[i]);
1839 } else {
1840 /* Emit a dummy buffer */
1841 panfrost_emit_varyings(batch, &varyings[i],
1842 so->stride[i] * 4,
1843 out_count);
1844
1845 /* Clear the attribute type */
1846 varyings[i].elements &= ~0xF;
1847 }
1848 }
1849
1850 panfrost_emit_varyings(batch, &varyings[general],
1851 num_gen_varyings * 16,
1852 vertex_count);
1853
1854 mali_ptr varyings_p;
1855
1856 /* fp32 vec4 gl_Position */
1857 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1858 sizeof(float) * 4, vertex_count);
1859 tiler_postfix->position_varying = varyings_p;
1860
1861
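/* fp16 gl_PointSize (2 bytes per vertex), only if point size is written */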
1862 if (panfrost_writes_point_size(ctx)) {
1863 varyings_p = panfrost_emit_varyings(batch,
1864 &varyings[gl_PointSize],
1865 2, vertex_count);
1866 primitive_size->pointer = varyings_p;
1867 }
1868
1869 if (gl_PointCoord >= 0)
1870 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1871
1872 if (gl_FrontFacing >= 0)
1873 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1874
1875 if (gl_FragCoord >= 0)
1876 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1877
1878 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1879
1880 /* Let's go ahead and link varying meta to the buffer in question, now
1881  * that that information is available. VARYING_SLOT_POS is mapped to
1882  * gl_FragCoord for fragment shaders but gl_Position for vertex shaders.
1883  */
1884
1885 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1886 gl_PointSize, gl_PointCoord,
1887 gl_FrontFacing);
1888
1889 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1890 gl_FragCoord, gl_PointSize,
1891 gl_PointCoord, gl_FrontFacing);
1892
1893 /* Redirect captured varyings to their streamout buffer, adjusting format and swizzle to the captured component count */
1894
1895 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1896 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1897
1898 for (unsigned i = 0; i < vs->varying_count; i++) {
1899 gl_varying_slot loc = vs->varyings_loc[i];
1900
1901 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1902 if (!captured)
1903 continue;
1904
1905 struct pipe_stream_output *o = pan_get_so(so, loc);
1906 ovs[i].index = o->output_buffer;
1907
1908 assert(o->stream == 0);
1909 ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
1910 | MALI_NR_CHANNELS(o->num_components);
1911
1912 if (device->quirks & HAS_SWIZZLES)
1913 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1914 else
1915 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1916
1917 /* Link to the fragment */
1918 signed fs_idx = -1;
1919
1920 /* Link up */
1921 for (unsigned j = 0; j < fs->varying_count; ++j) {
1922 if (fs->varyings_loc[j] == loc) {
1923 fs_idx = j;
1924 break;
1925 }
1926 }
1927
1928 if (fs_idx >= 0) {
1929 ofs[fs_idx].index = ovs[i].index;
1930 ofs[fs_idx].format = ovs[i].format;
1931 ofs[fs_idx].swizzle = ovs[i].swizzle;
1932 }
1933 }
1934
1935 /* Replace point sprite */
1936 for (unsigned i = 0; i < fs->varying_count; i++) {
1937 /* If we have a point sprite replacement, handle it here. We
1938  * have to translate the location first. TODO: Flip Y in the
1939  * shader; we're already keying the shader, so this is just a time crunch. */
1940
1941 if (has_point_coord(fs->point_sprite_mask,
1942 fs->varyings_loc[i])) {
1943 ofs[i].index = gl_PointCoord;
1944
1945 /* Swizzle out the z/w to 0/1 */
1946 ofs[i].format = MALI_RG16F;
1947 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1948 }
1949 }
1950
1951 /* Fix up unaligned streamout addresses by folding the misalignment into each record's src_offset */
1952 for (unsigned i = 0; i < so_count; ++i) {
1953 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1954 continue;
1955
1956 unsigned align = (varyings[i].elements & 63);
1957
1958 /* While we're at it, the SO buffers are linear */
1959
1960 if (!align) {
1961 varyings[i].elements |= MALI_ATTR_LINEAR;
1962 continue;
1963 }
1964
1965 /* We need to adjust alignment */
1966 varyings[i].elements &= ~63;
1967 varyings[i].elements |= MALI_ATTR_LINEAR;
1968 varyings[i].size += align;
1969
1970 for (unsigned v = 0; v < vs->varying_count; ++v) {
1971 if (ovs[v].index != i)
1972 continue;
1973
1974 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1975 }
1976
1977 for (unsigned f = 0; f < fs->varying_count; ++f) {
1978 if (ofs[f].index != i)
1979 continue;
1980
1981 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1982 }
1983 }
1984
1985 varyings_p = panfrost_upload_transient(batch, varyings,
1986 idx * sizeof(*varyings));
1987 vertex_postfix->varyings = varyings_p;
1988 tiler_postfix->varyings = varyings_p;
1989
1990 vertex_postfix->varying_meta = trans.gpu;
1991 tiler_postfix->varying_meta = trans.gpu + vs_size;
1992 }
1993
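/* Pack the vertex and tiler payloads in the layout expected by the GPU
 * (Bifrost or Midgard) and queue them on the batch, with the tiler job
 * depending on the vertex job. Handles the reverse-order wallpaper hack and
 * skips the tiler job entirely when rasterizer discard is enabled. */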
1994 void
1995 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1996 struct mali_vertex_tiler_prefix *vertex_prefix,
1997 struct mali_vertex_tiler_postfix *vertex_postfix,
1998 struct mali_vertex_tiler_prefix *tiler_prefix,
1999 struct mali_vertex_tiler_postfix *tiler_postfix,
2000 union midgard_primitive_size *primitive_size)
2001 {
2002 struct panfrost_context *ctx = batch->ctx;
2003 struct panfrost_device *device = pan_device(ctx->base.screen);
2004 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
2005 struct bifrost_payload_vertex bifrost_vertex = {0,};
2006 struct bifrost_payload_tiler bifrost_tiler = {0,};
2007 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
2008 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
2009 void *vp, *tp;
2010 size_t vp_size, tp_size;
2011
2012 if (device->quirks & IS_BIFROST) {
2013 bifrost_vertex.prefix = *vertex_prefix;
2014 bifrost_vertex.postfix = *vertex_postfix;
2015 vp = &bifrost_vertex;
2016 vp_size = sizeof(bifrost_vertex);
2017
2018 bifrost_tiler.prefix = *tiler_prefix;
2019 bifrost_tiler.tiler.primitive_size = *primitive_size;
2020 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2021 bifrost_tiler.postfix = *tiler_postfix;
2022 tp = &bifrost_tiler;
2023 tp_size = sizeof(bifrost_tiler);
2024 } else {
2025 midgard_vertex.prefix = *vertex_prefix;
2026 midgard_vertex.postfix = *vertex_postfix;
2027 vp = &midgard_vertex;
2028 vp_size = sizeof(midgard_vertex);
2029
2030 midgard_tiler.prefix = *tiler_prefix;
2031 midgard_tiler.postfix = *tiler_postfix;
2032 midgard_tiler.primitive_size = *primitive_size;
2033 tp = &midgard_tiler;
2034 tp_size = sizeof(midgard_tiler);
2035 }
2036
2037 if (wallpapering) {
2038 /* Inject in reverse order, with "predicted" job indices.
2039 * THIS IS A HACK XXX */
2040 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2041 batch->job_index + 2, tp, tp_size, true);
2042 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2043 vp, vp_size, true);
2044 return;
2045 }
2046
2047 /* If rasterizer discard is enabled, only submit the vertex job */
2048
2049 bool rasterizer_discard = ctx->rasterizer &&
2050 ctx->rasterizer->base.rasterizer_discard;
2051
2052 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2053 vp, vp_size, false);
2054
2055 if (rasterizer_discard)
2056 return;
2057
2058 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2059 false);
2060 }
2061
2062 /* TODO: stop hardcoding this */
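/* Upload the batch's sample location table (96 16-bit entries) to transient
 * memory and return the GPU pointer. */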
2063 mali_ptr
2064 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2065 {
2066 uint16_t locations[] = {
2067 128, 128,
2068 0, 256,
2069 0, 256,
2070 0, 256,
2071 0, 256,
2072 0, 256,
2073 0, 256,
2074 0, 256,
2075 0, 256,
2076 0, 256,
2077 0, 256,
2078 0, 256,
2079 0, 256,
2080 0, 256,
2081 0, 256,
2082 0, 256,
2083 0, 256,
2084 0, 256,
2085 0, 256,
2086 0, 256,
2087 0, 256,
2088 0, 256,
2089 0, 256,
2090 0, 256,
2091 0, 256,
2092 0, 256,
2093 0, 256,
2094 0, 256,
2095 0, 256,
2096 0, 256,
2097 0, 256,
2098 0, 256,
2099 128, 128,
2100 0, 0,
2101 0, 0,
2102 0, 0,
2103 0, 0,
2104 0, 0,
2105 0, 0,
2106 0, 0,
2107 0, 0,
2108 0, 0,
2109 0, 0,
2110 0, 0,
2111 0, 0,
2112 0, 0,
2113 0, 0,
2114 0, 0,
2115 };
2116
2117 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2118 }