panfrost: Document Midgard Inf/NaN suppress bit
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
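/* Only the [min_index, max_index] range of vertices is fed through the
 * vertex shader: offset_start rebases vertex fetch to min_index, and
 * offset_bias_correction appears to cancel that rebase when the indices
 * themselves are dereferenced (interpretation, not confirmed). */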
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
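/* Decompose padded_count as (2k + 1) << shift: shift counts the trailing
 * zero bits and k is the remaining odd factor halved. Presumably the
 * hardware wants the instance size expressed as an odd number times a
 * power of two, hence this split. */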
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349
350 /* TODO: This is not conformant on ES3 */
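/* Presumably MALI_SUPPRESS_INF_NAN makes the core flush out Infs/NaNs
 * rather than generate and propagate them as ES3 requires -- hence the
 * conformance caveat above. Guessed from the bit name; not confirmed
 * against hardware documentation. */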
351 meta->midgard1.flags_hi = MALI_SUPPRESS_INF_NAN;
352
353 meta->midgard1.flags_lo = 0x220;
354 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
355 }
356 }
357
358 static unsigned
359 panfrost_translate_compare_func(enum pipe_compare_func in)
360 {
361 switch (in) {
362 case PIPE_FUNC_NEVER:
363 return MALI_FUNC_NEVER;
364
365 case PIPE_FUNC_LESS:
366 return MALI_FUNC_LESS;
367
368 case PIPE_FUNC_EQUAL:
369 return MALI_FUNC_EQUAL;
370
371 case PIPE_FUNC_LEQUAL:
372 return MALI_FUNC_LEQUAL;
373
374 case PIPE_FUNC_GREATER:
375 return MALI_FUNC_GREATER;
376
377 case PIPE_FUNC_NOTEQUAL:
378 return MALI_FUNC_NOTEQUAL;
379
380 case PIPE_FUNC_GEQUAL:
381 return MALI_FUNC_GEQUAL;
382
383 case PIPE_FUNC_ALWAYS:
384 return MALI_FUNC_ALWAYS;
385
386 default:
387 unreachable("Invalid func");
388 }
389 }
390
391 static unsigned
392 panfrost_translate_stencil_op(enum pipe_stencil_op in)
393 {
394 switch (in) {
395 case PIPE_STENCIL_OP_KEEP:
396 return MALI_STENCIL_KEEP;
397
398 case PIPE_STENCIL_OP_ZERO:
399 return MALI_STENCIL_ZERO;
400
401 case PIPE_STENCIL_OP_REPLACE:
402 return MALI_STENCIL_REPLACE;
403
404 case PIPE_STENCIL_OP_INCR:
405 return MALI_STENCIL_INCR;
406
407 case PIPE_STENCIL_OP_DECR:
408 return MALI_STENCIL_DECR;
409
410 case PIPE_STENCIL_OP_INCR_WRAP:
411 return MALI_STENCIL_INCR_WRAP;
412
413 case PIPE_STENCIL_OP_DECR_WRAP:
414 return MALI_STENCIL_DECR_WRAP;
415
416 case PIPE_STENCIL_OP_INVERT:
417 return MALI_STENCIL_INVERT;
418
419 default:
420 unreachable("Invalid stencil op");
421 }
422 }
423
424 static unsigned
425 translate_tex_wrap(enum pipe_tex_wrap w)
426 {
427 switch (w) {
428 case PIPE_TEX_WRAP_REPEAT:
429 return MALI_WRAP_REPEAT;
430
431 case PIPE_TEX_WRAP_CLAMP:
432 return MALI_WRAP_CLAMP;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
435 return MALI_WRAP_CLAMP_TO_EDGE;
436
437 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
438 return MALI_WRAP_CLAMP_TO_BORDER;
439
440 case PIPE_TEX_WRAP_MIRROR_REPEAT:
441 return MALI_WRAP_MIRRORED_REPEAT;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP:
444 return MALI_WRAP_MIRRORED_CLAMP;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
448
449 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
450 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
451
452 default:
453 unreachable("Invalid wrap");
454 }
455 }
456
457 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
458 struct mali_sampler_descriptor *hw)
459 {
460 unsigned func = panfrost_translate_compare_func(cso->compare_func);
461 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
462 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
463 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
464 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
465 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
466 unsigned mip_filter = mip_linear ?
467 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
468 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
469
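/* The compare function appears to be encoded with the operands swapped
 * relative to Gallium's convention, hence panfrost_flip_compare_func in
 * the descriptor below. */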
470 *hw = (struct mali_sampler_descriptor) {
471 .filter_mode = min_filter | mag_filter | mip_filter |
472 normalized,
473 .wrap_s = translate_tex_wrap(cso->wrap_s),
474 .wrap_t = translate_tex_wrap(cso->wrap_t),
475 .wrap_r = translate_tex_wrap(cso->wrap_r),
476 .compare_func = panfrost_flip_compare_func(func),
477 .border_color = {
478 cso->border_color.f[0],
479 cso->border_color.f[1],
480 cso->border_color.f[2],
481 cso->border_color.f[3]
482 },
483 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
484 .max_lod = FIXED_16(cso->max_lod, false),
485 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
486 .seamless_cube_map = cso->seamless_cube_map,
487 };
488
489 /* If necessary, we disable mipmapping in the sampler descriptor by
490 * clamping the LOD as tight as possible (from 0 to epsilon,
491 * essentially -- remember these are fixed point numbers, so
492 * epsilon=1/256) */
493
494 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
495 hw->max_lod = hw->min_lod + 1;
496 }
497
498 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
499 struct bifrost_sampler_descriptor *hw)
500 {
501 *hw = (struct bifrost_sampler_descriptor) {
502 .unk1 = 0x1,
503 .wrap_s = translate_tex_wrap(cso->wrap_s),
504 .wrap_t = translate_tex_wrap(cso->wrap_t),
505 .wrap_r = translate_tex_wrap(cso->wrap_r),
506 .unk8 = 0x8,
507 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
508 .norm_coords = cso->normalized_coords,
509 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
510 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
511 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
512 .max_lod = FIXED_16(cso->max_lod, false),
513 };
514
515 /* If necessary, we disable mipmapping in the sampler descriptor by
516 * clamping the LOD as tight as possible (from 0 to epsilon,
517 * essentially -- remember these are fixed point numbers, so
518 * epsilon=1/256) */
519
520 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
521 hw->max_lod = hw->min_lod + 1;
522 }
523
524 static void
525 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
526 struct mali_stencil_test *out)
527 {
528 out->ref = 0; /* Gallium gets it from elsewhere */
529
530 out->mask = in->valuemask;
531 out->func = panfrost_translate_compare_func(in->func);
532 out->sfail = panfrost_translate_stencil_op(in->fail_op);
533 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
534 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
535 }
536
537 static void
538 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
539 struct mali_shader_meta *fragmeta)
540 {
541 if (!ctx->rasterizer) {
542 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
543 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
544 fragmeta->depth_units = 0.0f;
545 fragmeta->depth_factor = 0.0f;
546 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
547 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
548 return;
549 }
550
551 bool msaa = ctx->rasterizer->base.multisample;
552
553 /* TODO: Sample size */
554 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
555 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
556 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
557 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
558
559 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
560
561 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
562 ctx->rasterizer->base.offset_tri);
563 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
564 ctx->rasterizer->base.offset_tri);
565 }
566
567 static void
568 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
569 struct mali_shader_meta *fragmeta)
570 {
571 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
572 int zfunc = PIPE_FUNC_ALWAYS;
573
574 if (!zsa) {
575 struct pipe_stencil_state default_stencil = {
576 .enabled = 0,
577 .func = PIPE_FUNC_ALWAYS,
578 .fail_op = MALI_STENCIL_KEEP,
579 .zfail_op = MALI_STENCIL_KEEP,
580 .zpass_op = MALI_STENCIL_KEEP,
581 .writemask = 0xFF,
582 .valuemask = 0xFF
583 };
584
585 panfrost_make_stencil_state(&default_stencil,
586 &fragmeta->stencil_front);
587 fragmeta->stencil_mask_front = default_stencil.writemask;
588 fragmeta->stencil_back = fragmeta->stencil_front;
589 fragmeta->stencil_mask_back = default_stencil.writemask;
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
591 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
592 } else {
593 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
594 zsa->stencil[0].enabled);
595 panfrost_make_stencil_state(&zsa->stencil[0],
596 &fragmeta->stencil_front);
597 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
598 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
599
600 /* If back-stencil is not enabled, use the front values */
601
602 if (zsa->stencil[1].enabled) {
603 panfrost_make_stencil_state(&zsa->stencil[1],
604 &fragmeta->stencil_back);
605 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
606 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
607 } else {
608 fragmeta->stencil_back = fragmeta->stencil_front;
609 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
610 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
611 }
612
613 if (zsa->depth.enabled)
614 zfunc = zsa->depth.func;
615
616 /* Depth state (TODO: Refactor) */
617
618 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
619 zsa->depth.writemask);
620 }
621
622 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
623 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
624 }
625
626 static bool
627 panfrost_fs_required(
628 struct panfrost_shader_state *fs,
629 struct panfrost_blend_final *blend,
630 unsigned rt_count)
631 {
632 /* If we generally have side effects */
633 if (fs->fs_sidefx)
634 return true;
635
636 /* If colour is written we need to execute */
637 for (unsigned i = 0; i < rt_count; ++i) {
638 if (!blend[i].no_colour)
639 return true;
640 }
641
642 /* If depth is written and not implied we need to execute.
643 * TODO: Predicate on Z/S writes being enabled */
644 return (fs->writes_depth || fs->writes_stencil);
645 }
646
647 static void
648 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
649 struct mali_shader_meta *fragmeta,
650 void *rts)
651 {
652 const struct panfrost_device *dev = pan_device(ctx->base.screen);
653 struct panfrost_shader_state *fs;
654 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
655
656 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
657 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
658 !ctx->blend->base.dither);
659
660 /* Get blending setup */
661 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
662
663 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
664 unsigned shader_offset = 0;
665 struct panfrost_bo *shader_bo = NULL;
666
667 for (unsigned c = 0; c < rt_count; ++c)
668 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
669 &shader_offset);
670
671 /* Disable shader execution if we can */
672 if (dev->quirks & MIDGARD_SHADERLESS
673 && !panfrost_fs_required(fs, blend, rt_count)) {
674 fragmeta->shader = 0;
675 fragmeta->attribute_count = 0;
676 fragmeta->varying_count = 0;
677 fragmeta->texture_count = 0;
678 fragmeta->sampler_count = 0;
679
680 /* This feature is not known to work on Bifrost */
681 fragmeta->midgard1.work_count = 1;
682 fragmeta->midgard1.uniform_count = 0;
683 fragmeta->midgard1.uniform_buffer_count = 0;
684 }
685
686 /* If there is a blend shader, work registers are shared. We impose 8
687 * work registers as a limit for blend shaders. Should be lower XXX */
688
689 if (!(dev->quirks & IS_BIFROST)) {
690 for (unsigned c = 0; c < rt_count; ++c) {
691 if (blend[c].is_shader) {
692 fragmeta->midgard1.work_count =
693 MAX2(fragmeta->midgard1.work_count, 8);
694 }
695 }
696 }
697
698 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
699 * copied to the blend_meta appended (by convention), but this is the
700 * field actually read by the hardware. (Or maybe both are read...?).
701 * Specify the last RTi with a blend shader. */
702
703 fragmeta->blend.shader = 0;
704
705 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
706 if (!blend[rt].is_shader)
707 continue;
708
709 fragmeta->blend.shader = blend[rt].shader.gpu |
710 blend[rt].shader.first_tag;
711 break;
712 }
713
714 if (dev->quirks & MIDGARD_SFBD) {
715 /* On single render target (SFBD) platforms, the blend
716 * information is inside the shader meta itself. We additionally
717 * need to signal CAN_DISCARD for nontrivial blend modes (so
718 * we're able to read back the destination buffer) */
719
720 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
721 blend[0].is_shader);
722
723 if (!blend[0].is_shader) {
724 fragmeta->blend.equation = *blend[0].equation.equation;
725 fragmeta->blend.constant = blend[0].equation.constant;
726 }
727
728 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
729 !blend[0].no_blending || fs->can_discard);
730 return;
731 }
732
733 /* Additional blend descriptor tacked on for jobs using MFBD */
734
735 for (unsigned i = 0; i < rt_count; ++i) {
736 unsigned flags = 0;
737
738 if (ctx->pipe_framebuffer.nr_cbufs > i && !blend[i].no_colour) {
739 flags = 0x200;
740
741 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
742 (ctx->pipe_framebuffer.cbufs[i]) &&
743 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
744
745 SET_BIT(flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
746 SET_BIT(flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
747 SET_BIT(flags, MALI_BLEND_SRGB, is_srgb);
748 SET_BIT(flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
749 }
750
751 if (dev->quirks & IS_BIFROST) {
752 struct bifrost_blend_rt *brts = rts;
753
754 brts[i].flags = flags;
755
756 if (blend[i].is_shader) {
757 /* The blend shader's address needs to be at
758 * the same top 32 bit as the fragment shader.
759 * TODO: Ensure that's always the case.
760 */
761 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
762 (fs->bo->gpu & (0xffffffffull << 32)));
763 brts[i].shader = blend[i].shader.gpu;
764 brts[i].unk2 = 0x0;
765 } else if (ctx->pipe_framebuffer.nr_cbufs > i) {
766 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
767 const struct util_format_description *format_desc;
768 format_desc = util_format_description(format);
769
770 brts[i].equation = *blend[i].equation.equation;
771
772 /* TODO: this is a bit more complicated */
773 brts[i].constant = blend[i].equation.constant;
774
775 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
776
777 /* 0x19 disables blending and forces REPLACE
778 * mode (equivalent to rgb_mode = alpha_mode =
779 * 0x122, colour mask = 0xF). 0x1a allows
780 * blending. */
781 brts[i].unk2 = blend[i].no_blending ? 0x19 : 0x1a;
782
783 brts[i].shader_type = fs->blend_types[i];
784 } else {
785 /* Dummy attachment for depth-only */
786 brts[i].unk2 = 0x3;
787 brts[i].shader_type = fs->blend_types[i];
788 }
789 } else {
790 struct midgard_blend_rt *mrts = rts;
791 mrts[i].flags = flags;
792
793 if (blend[i].is_shader) {
794 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
795 } else {
796 mrts[i].blend.equation = *blend[i].equation.equation;
797 mrts[i].blend.constant = blend[i].equation.constant;
798 }
799 }
800 }
801 }
802
803 static void
804 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
805 struct mali_shader_meta *fragmeta,
806 void *rts)
807 {
808 const struct panfrost_device *dev = pan_device(ctx->base.screen);
809 struct panfrost_shader_state *fs;
810
811 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
812
813 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
814 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
815 fragmeta->unknown2_4 = 0x4e0;
816
817 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
818 * is required (independent of 32-bit/64-bit descriptors), or why it's
819 * not used on later GPU revisions. Otherwise, all shader jobs fault on
820 * these earlier chips (perhaps this is a chicken bit of some kind).
821 * More investigation is needed. */
822
823 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
824
825 if (dev->quirks & IS_BIFROST) {
826 /* TODO */
827 } else {
828 /* Depending on whether it's legal in the given shader, we try to
829 * enable early-z testing (or forward-pixel kill?) */
830
831 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
832 !fs->can_discard && !fs->writes_depth);
833
834 /* Add the writes Z/S flags if needed. */
835 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
836 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
837
838 /* Any time texturing is used, derivatives are implicitly calculated,
839 * so we need to enable helper invocations */
840
841 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
842 fs->helper_invocations);
843
844 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
845
846 bool depth_enabled = fs->writes_depth ||
847 (zsa && zsa->depth.enabled && zsa->depth.func != PIPE_FUNC_ALWAYS);
848
849 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, !depth_enabled && fs->can_discard);
850 SET_BIT(fragmeta->midgard1.flags_lo, MALI_READS_ZS, depth_enabled && fs->can_discard);
851 }
852
853 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
854 panfrost_frag_meta_zsa_update(ctx, fragmeta);
855 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
856 }
857
858 void
859 panfrost_emit_shader_meta(struct panfrost_batch *batch,
860 enum pipe_shader_type st,
861 struct mali_vertex_tiler_postfix *postfix)
862 {
863 struct panfrost_context *ctx = batch->ctx;
864 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
865
866 if (!ss) {
867 postfix->shader = 0;
868 return;
869 }
870
871 struct mali_shader_meta meta;
872
873 panfrost_shader_meta_init(ctx, st, &meta);
874
875 /* Add the shader BO to the batch. */
876 panfrost_batch_add_bo(batch, ss->bo,
877 PAN_BO_ACCESS_PRIVATE |
878 PAN_BO_ACCESS_READ |
879 panfrost_bo_access_for_stage(st));
880
881 mali_ptr shader_ptr;
882
883 if (st == PIPE_SHADER_FRAGMENT) {
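/* For fragment shaders, the per-render-target blend descriptors are
 * appended directly after the shader meta in one transient allocation:
 * [mali_shader_meta][RT 0][RT 1]... */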
884 struct panfrost_device *dev = pan_device(ctx->base.screen);
885 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
886 size_t desc_size = sizeof(meta);
887 void *rts = NULL;
888 struct panfrost_transfer xfer;
889 unsigned rt_size;
890
891 if (dev->quirks & MIDGARD_SFBD)
892 rt_size = 0;
893 else if (dev->quirks & IS_BIFROST)
894 rt_size = sizeof(struct bifrost_blend_rt);
895 else
896 rt_size = sizeof(struct midgard_blend_rt);
897
898 desc_size += rt_size * rt_count;
899
900 if (rt_size)
901 rts = rzalloc_size(ctx, rt_size * rt_count);
902
903 panfrost_frag_shader_meta_init(ctx, &meta, rts);
904
905 xfer = panfrost_allocate_transient(batch, desc_size);
906
907 memcpy(xfer.cpu, &meta, sizeof(meta));
908 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
909
910 if (rt_size)
911 ralloc_free(rts);
912
913 shader_ptr = xfer.gpu;
914 } else {
915 shader_ptr = panfrost_upload_transient(batch, &meta,
916 sizeof(meta));
917 }
918
919 postfix->shader = shader_ptr;
920 }
921
922 static void
923 panfrost_mali_viewport_init(struct panfrost_context *ctx,
924 struct mali_viewport *mvp)
925 {
926 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
927
928 /* Clip bounds are encoded as floats. The viewport itself is encoded as
929 * (somewhat) asymmetric ints. */
930
931 const struct pipe_scissor_state *ss = &ctx->scissor;
932
933 memset(mvp, 0, sizeof(*mvp));
934
935 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
936 * each direction. Clipping to the viewport in theory should work, but
937 * in practice causes issues when we're not explicitly trying to
938 * scissor */
939
940 *mvp = (struct mali_viewport) {
941 .clip_minx = -INFINITY,
942 .clip_miny = -INFINITY,
943 .clip_maxx = INFINITY,
944 .clip_maxy = INFINITY,
945 };
946
947 /* Always scissor to the viewport by default. */
948 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
949 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
950
951 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
952 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
953
954 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
955 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
956
957 /* Apply the scissor test */
958
959 unsigned minx, miny, maxx, maxy;
960
961 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
962 minx = MAX2(ss->minx, vp_minx);
963 miny = MAX2(ss->miny, vp_miny);
964 maxx = MIN2(ss->maxx, vp_maxx);
965 maxy = MIN2(ss->maxy, vp_maxy);
966 } else {
967 minx = vp_minx;
968 miny = vp_miny;
969 maxx = vp_maxx;
970 maxy = vp_maxy;
971 }
972
973 /* Hardware needs the min/max to be strictly ordered, so flip if we
974 * need to. The viewport transformation in the vertex shader will
975 * handle the negatives if we don't */
976
977 if (miny > maxy) {
978 unsigned temp = miny;
979 miny = maxy;
980 maxy = temp;
981 }
982
983 if (minx > maxx) {
984 unsigned temp = minx;
985 minx = maxx;
986 maxx = temp;
987 }
988
989 if (minz > maxz) {
990 float temp = minz;
991 minz = maxz;
992 maxz = temp;
993 }
994
995 /* Clamp to the framebuffer size as a last check */
996
997 minx = MIN2(ctx->pipe_framebuffer.width, minx);
998 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
999
1000 miny = MIN2(ctx->pipe_framebuffer.height, miny);
1001 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
1002
1003 /* Upload */
1004
1005 mvp->viewport0[0] = minx;
1006 mvp->viewport1[0] = MALI_POSITIVE(maxx);
1007
1008 mvp->viewport0[1] = miny;
1009 mvp->viewport1[1] = MALI_POSITIVE(maxy);
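/* viewport1 holds the inclusive maxima: MALI_POSITIVE(x) encodes x - 1,
 * which is why panfrost_emit_viewport adds the +1 back when growing the
 * batch scissor. */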
1010
1011 mvp->clip_minz = minz;
1012 mvp->clip_maxz = maxz;
1013 }
1014
1015 void
1016 panfrost_emit_viewport(struct panfrost_batch *batch,
1017 struct mali_vertex_tiler_postfix *tiler_postfix)
1018 {
1019 struct panfrost_context *ctx = batch->ctx;
1020 struct mali_viewport mvp;
1021
1022 panfrost_mali_viewport_init(batch->ctx, &mvp);
1023
1024 /* Update the job, unless we're doing wallpapering (whose lack of
1025 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
1026 * just... be faster :) */
1027
1028 if (!ctx->wallpaper_batch)
1029 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
1030 mvp.viewport0[1],
1031 mvp.viewport1[0] + 1,
1032 mvp.viewport1[1] + 1);
1033
1034 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
1035 sizeof(mvp));
1036 }
1037
1038 static mali_ptr
1039 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
1040 enum pipe_shader_type st,
1041 struct panfrost_constant_buffer *buf,
1042 unsigned index)
1043 {
1044 struct pipe_constant_buffer *cb = &buf->cb[index];
1045 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1046
1047 if (rsrc) {
1048 panfrost_batch_add_bo(batch, rsrc->bo,
1049 PAN_BO_ACCESS_SHARED |
1050 PAN_BO_ACCESS_READ |
1051 panfrost_bo_access_for_stage(st));
1052
1053 /* Alignment guaranteed by
1054 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
1055 return rsrc->bo->gpu + cb->buffer_offset;
1056 } else if (cb->user_buffer) {
1057 return panfrost_upload_transient(batch,
1058 cb->user_buffer +
1059 cb->buffer_offset,
1060 cb->buffer_size);
1061 } else {
1062 unreachable("No constant buffer");
1063 }
1064 }
1065
1066 struct sysval_uniform {
1067 union {
1068 float f[4];
1069 int32_t i[4];
1070 uint32_t u[4];
1071 uint64_t du[2];
1072 };
1073 };
1074
1075 static void
1076 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1077 struct sysval_uniform *uniform)
1078 {
1079 struct panfrost_context *ctx = batch->ctx;
1080 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1081
1082 uniform->f[0] = vp->scale[0];
1083 uniform->f[1] = vp->scale[1];
1084 uniform->f[2] = vp->scale[2];
1085 }
1086
1087 static void
1088 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1089 struct sysval_uniform *uniform)
1090 {
1091 struct panfrost_context *ctx = batch->ctx;
1092 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1093
1094 uniform->f[0] = vp->translate[0];
1095 uniform->f[1] = vp->translate[1];
1096 uniform->f[2] = vp->translate[2];
1097 }
1098
1099 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1100 enum pipe_shader_type st,
1101 unsigned int sysvalid,
1102 struct sysval_uniform *uniform)
1103 {
1104 struct panfrost_context *ctx = batch->ctx;
1105 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1106 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1107 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1108 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1109
1110 assert(dim);
1111 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1112
1113 if (dim > 1)
1114 uniform->i[1] = u_minify(tex->texture->height0,
1115 tex->u.tex.first_level);
1116
1117 if (dim > 2)
1118 uniform->i[2] = u_minify(tex->texture->depth0,
1119 tex->u.tex.first_level);
1120
1121 if (is_array)
1122 uniform->i[dim] = tex->texture->array_size;
1123 }
1124
1125 static void
1126 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1127 enum pipe_shader_type st,
1128 unsigned ssbo_id,
1129 struct sysval_uniform *uniform)
1130 {
1131 struct panfrost_context *ctx = batch->ctx;
1132
1133 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1134 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1135
1136 /* Compute address */
1137 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1138
1139 panfrost_batch_add_bo(batch, bo,
1140 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1141 panfrost_bo_access_for_stage(st));
1142
1143 /* Upload address and size as sysval */
1144 uniform->du[0] = bo->gpu + sb.buffer_offset;
1145 uniform->u[2] = sb.buffer_size;
1146 }
1147
1148 static void
1149 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1150 enum pipe_shader_type st,
1151 unsigned samp_idx,
1152 struct sysval_uniform *uniform)
1153 {
1154 struct panfrost_context *ctx = batch->ctx;
1155 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1156
1157 uniform->f[0] = sampl->min_lod;
1158 uniform->f[1] = sampl->max_lod;
1159 uniform->f[2] = sampl->lod_bias;
1160
1161 /* Even without any errata, Midgard represents "no mipmapping" as
1162 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1163 * panfrost_create_sampler_state which also explains our choice of
1164 * epsilon value (again to keep behaviour consistent) */
1165
1166 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1167 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1168 }
1169
1170 static void
1171 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1172 struct sysval_uniform *uniform)
1173 {
1174 struct panfrost_context *ctx = batch->ctx;
1175
1176 uniform->u[0] = ctx->compute_grid->grid[0];
1177 uniform->u[1] = ctx->compute_grid->grid[1];
1178 uniform->u[2] = ctx->compute_grid->grid[2];
1179 }
1180
1181 static void
1182 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1183 struct panfrost_shader_state *ss,
1184 enum pipe_shader_type st)
1185 {
1186 struct sysval_uniform *uniforms = (void *)buf;
1187
1188 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1189 int sysval = ss->sysval[i];
1190
1191 switch (PAN_SYSVAL_TYPE(sysval)) {
1192 case PAN_SYSVAL_VIEWPORT_SCALE:
1193 panfrost_upload_viewport_scale_sysval(batch,
1194 &uniforms[i]);
1195 break;
1196 case PAN_SYSVAL_VIEWPORT_OFFSET:
1197 panfrost_upload_viewport_offset_sysval(batch,
1198 &uniforms[i]);
1199 break;
1200 case PAN_SYSVAL_TEXTURE_SIZE:
1201 panfrost_upload_txs_sysval(batch, st,
1202 PAN_SYSVAL_ID(sysval),
1203 &uniforms[i]);
1204 break;
1205 case PAN_SYSVAL_SSBO:
1206 panfrost_upload_ssbo_sysval(batch, st,
1207 PAN_SYSVAL_ID(sysval),
1208 &uniforms[i]);
1209 break;
1210 case PAN_SYSVAL_NUM_WORK_GROUPS:
1211 panfrost_upload_num_work_groups_sysval(batch,
1212 &uniforms[i]);
1213 break;
1214 case PAN_SYSVAL_SAMPLER:
1215 panfrost_upload_sampler_sysval(batch, st,
1216 PAN_SYSVAL_ID(sysval),
1217 &uniforms[i]);
1218 break;
1219 default:
1220 assert(0);
1221 }
1222 }
1223 }
1224
1225 static const void *
1226 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1227 unsigned index)
1228 {
1229 struct pipe_constant_buffer *cb = &buf->cb[index];
1230 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1231
1232 if (rsrc)
1233 return rsrc->bo->cpu;
1234 else if (cb->user_buffer)
1235 return cb->user_buffer;
1236 else
1237 unreachable("No constant buffer");
1238 }
1239
1240 void
1241 panfrost_emit_const_buf(struct panfrost_batch *batch,
1242 enum pipe_shader_type stage,
1243 struct mali_vertex_tiler_postfix *postfix)
1244 {
1245 struct panfrost_context *ctx = batch->ctx;
1246 struct panfrost_shader_variants *all = ctx->shader[stage];
1247
1248 if (!all)
1249 return;
1250
1251 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1252
1253 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1254
1255 /* Uniforms are implicitly UBO #0 */
1256 bool has_uniforms = buf->enabled_mask & (1 << 0);
1257
1258 /* Allocate room for the sysval and the uniforms */
1259 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1260 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1261 size_t size = sys_size + uniform_size;
1262 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1263 size);
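/* The transient block holds the sysvals first, with the user uniforms
 * packed immediately after; UBO #0 below points at the start of it. */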
1264
1265 /* Upload sysvals requested by the shader */
1266 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1267
1268 /* Upload uniforms */
1269 if (has_uniforms && uniform_size) {
1270 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1271 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1272 }
1273
1274 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1275 * uploaded */
1276
1277 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1278 assert(ubo_count >= 1);
1279
1280 size_t sz = sizeof(uint64_t) * ubo_count;
1281 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1282 int uniform_count = ss->uniform_count;
1283
1284 /* Upload uniforms as a UBO */
1285 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1286
1287 /* The rest are honest-to-goodness UBOs */
1288
1289 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1290 size_t usz = buf->cb[ubo].buffer_size;
1291 bool enabled = buf->enabled_mask & (1 << ubo);
1292 bool empty = usz == 0;
1293
1294 if (!enabled || empty) {
1295 /* Stub out disabled UBOs to catch accesses */
1296 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1297 continue;
1298 }
1299
1300 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1301 buf, ubo);
1302
1303 unsigned bytes_per_field = 16;
1304 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1305 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1306 }
1307
1308 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1309 postfix->uniforms = transfer.gpu;
1310 postfix->uniform_buffers = ubufs;
1311
1312 buf->dirty_mask = 0;
1313 }
1314
1315 void
1316 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1317 const struct pipe_grid_info *info,
1318 struct midgard_payload_vertex_tiler *vtp)
1319 {
1320 struct panfrost_context *ctx = batch->ctx;
1321 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1322 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1323 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1324 128));
1325 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1326 info->grid[2] * 4;
1327 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1328 shared_size,
1329 1);
1330
1331 struct mali_shared_memory shared = {
1332 .shared_memory = bo->gpu,
1333 .shared_workgroup_count =
1334 util_logbase2_ceil(info->grid[0]) +
1335 util_logbase2_ceil(info->grid[1]) +
1336 util_logbase2_ceil(info->grid[2]),
1337 .shared_unk1 = 0x2,
1338 .shared_shift = util_logbase2(single_size) - 1
1339 };
1340
1341 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1342 sizeof(shared));
1343 }
1344
1345 static mali_ptr
1346 panfrost_get_tex_desc(struct panfrost_batch *batch,
1347 enum pipe_shader_type st,
1348 struct panfrost_sampler_view *view)
1349 {
1350 if (!view)
1351 return (mali_ptr) 0;
1352
1353 struct pipe_sampler_view *pview = &view->base;
1354 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1355
1356 /* Add the BO to the job so it's retained until the job is done. */
1357
1358 panfrost_batch_add_bo(batch, rsrc->bo,
1359 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1360 panfrost_bo_access_for_stage(st));
1361
1362 panfrost_batch_add_bo(batch, view->midgard_bo,
1363 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1364 panfrost_bo_access_for_stage(st));
1365
1366 return view->midgard_bo->gpu;
1367 }
1368
1369 void
1370 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1371 enum pipe_shader_type stage,
1372 struct mali_vertex_tiler_postfix *postfix)
1373 {
1374 struct panfrost_context *ctx = batch->ctx;
1375 struct panfrost_device *device = pan_device(ctx->base.screen);
1376
1377 if (!ctx->sampler_view_count[stage])
1378 return;
1379
1380 if (device->quirks & IS_BIFROST) {
1381 struct bifrost_texture_descriptor *descriptors;
1382
1383 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1384 ctx->sampler_view_count[stage]);
1385
1386 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1387 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1388 struct pipe_sampler_view *pview = &view->base;
1389 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1390
1391 /* Add the BOs to the job so they are retained until the job is done. */
1392
1393 panfrost_batch_add_bo(batch, rsrc->bo,
1394 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1395 panfrost_bo_access_for_stage(stage));
1396
1397 panfrost_batch_add_bo(batch, view->bifrost_bo,
1398 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1399 panfrost_bo_access_for_stage(stage));
1400
1401 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1402 }
1403
1404 postfix->textures = panfrost_upload_transient(batch,
1405 descriptors,
1406 sizeof(struct bifrost_texture_descriptor) *
1407 ctx->sampler_view_count[stage]);
1408
1409 free(descriptors);
1410 } else {
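/* Midgard takes an array of 64-bit pointers ("trampolines") to the
 * individual texture descriptors, rather than inlining the descriptors
 * as on Bifrost above. */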
1411 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1412
1413 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1414 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1415 ctx->sampler_views[stage][i]);
1416
1417 postfix->textures = panfrost_upload_transient(batch,
1418 trampolines,
1419 sizeof(uint64_t) *
1420 ctx->sampler_view_count[stage]);
1421 }
1422 }
1423
1424 void
1425 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1426 enum pipe_shader_type stage,
1427 struct mali_vertex_tiler_postfix *postfix)
1428 {
1429 struct panfrost_context *ctx = batch->ctx;
1430 struct panfrost_device *device = pan_device(ctx->base.screen);
1431
1432 if (!ctx->sampler_count[stage])
1433 return;
1434
1435 if (device->quirks & IS_BIFROST) {
1436 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1437 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1438 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1439 transfer_size);
1440 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1441
1442 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1443 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1444
1445 postfix->sampler_descriptor = transfer.gpu;
1446 } else {
1447 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1448 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1449 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1450 transfer_size);
1451 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1452
1453 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1454 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1455
1456 postfix->sampler_descriptor = transfer.gpu;
1457 }
1458 }
1459
1460 void
1461 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1462 struct mali_vertex_tiler_postfix *vertex_postfix)
1463 {
1464 struct panfrost_context *ctx = batch->ctx;
1465
1466 if (!ctx->vertex)
1467 return;
1468
1469 struct panfrost_vertex_state *so = ctx->vertex;
1470
1471 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1472 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1473 sizeof(*so->hw) *
1474 PAN_MAX_ATTRIBUTE);
1475 }
1476
1477 void
1478 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1479 struct mali_vertex_tiler_postfix *vertex_postfix)
1480 {
1481 struct panfrost_context *ctx = batch->ctx;
1482 struct panfrost_vertex_state *so = ctx->vertex;
1483
1484 /* Staged mali_attr, and index into them. i =/= k, depending on the
1485 * vertex buffer mask and instancing. Twice as much room is allocated,
1486 * for a worst case of NPOT_DIVIDEs which take up an extra slot */
1487 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1488 unsigned k = 0;
1489
1490 for (unsigned i = 0; i < so->num_elements; ++i) {
1491 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1492 * means duplicating some vertex buffers (who cares? aside from
1493 * maybe some caching implications but I somehow doubt that
1494 * matters) */
1495
1496 struct pipe_vertex_element *elem = &so->pipe[i];
1497 unsigned vbi = elem->vertex_buffer_index;
1498
1499 /* The exception to 1:1 mapping is that we can have multiple
1500 * entries (NPOT divisors), so we fixup anyways */
1501
1502 so->hw[i].index = k;
1503
1504 if (!(ctx->vb_mask & (1 << vbi)))
1505 continue;
1506
1507 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1508 struct panfrost_resource *rsrc;
1509
1510 rsrc = pan_resource(buf->buffer.resource);
1511 if (!rsrc)
1512 continue;
1513
1514 /* Align to 64 bytes by masking off the lower bits. This
1515 * will be adjusted back when we fixup the src_offset in
1516 * mali_attr_meta */
1517
1518 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1519 mali_ptr addr = raw_addr & ~63;
1520 unsigned chopped_addr = raw_addr - addr;
1521
1522 /* Add a dependency of the batch on the vertex buffer */
1523 panfrost_batch_add_bo(batch, rsrc->bo,
1524 PAN_BO_ACCESS_SHARED |
1525 PAN_BO_ACCESS_READ |
1526 PAN_BO_ACCESS_VERTEX_TILER);
1527
1528 /* Set common fields */
1529 attrs[k].elements = addr;
1530 attrs[k].stride = buf->stride;
1531
1532 /* Since we advanced the base pointer, we shrink the buffer
1533 * size */
1534 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1535
1536 /* We need to add the extra size we masked off (for
1537 * correctness) so the data doesn't get clamped away */
1538 attrs[k].size += chopped_addr;
1539
1540 /* For non-instancing make sure we initialize */
1541 attrs[k].shift = attrs[k].extra_flags = 0;
1542
1543 /* Instancing uses a dramatically different code path than
1544 * linear, so dispatch for the actual emission now that the
1545 * common code is finished */
1546
1547 unsigned divisor = elem->instance_divisor;
1548
1549 if (divisor && ctx->instance_count == 1) {
1550 /* Silly corner case where there's a divisor(=1) but
1551 * there's no legitimate instancing. So we want *every*
1552 * attribute to be the same. So set stride to zero so
1553 * we don't go anywhere. */
1554
1555 attrs[k].size = attrs[k].stride + chopped_addr;
1556 attrs[k].stride = 0;
1557 attrs[k++].elements |= MALI_ATTR_LINEAR;
1558 } else if (ctx->instance_count <= 1) {
1559 /* Normal, non-instanced attributes */
1560 attrs[k++].elements |= MALI_ATTR_LINEAR;
1561 } else {
1562 unsigned instance_shift = vertex_postfix->instance_shift;
1563 unsigned instance_odd = vertex_postfix->instance_odd;
1564
1565 k += panfrost_vertex_instanced(ctx->padded_count,
1566 instance_shift,
1567 instance_odd,
1568 divisor, &attrs[k]);
1569 }
1570 }
1571
1572 /* Add special gl_VertexID/gl_InstanceID buffers */
1573
1574 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1575 so->hw[PAN_VERTEX_ID].index = k++;
1576 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1577 so->hw[PAN_INSTANCE_ID].index = k++;
1578
1579 /* Upload whatever we emitted and go */
1580
1581 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1582 k * sizeof(*attrs));
1583 }
1584
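/* Allocate a transient buffer big enough for `count` elements of the
 * given stride, point the attribute record at it, and return the GPU
 * address so callers can also reference it directly (e.g. as the
 * position varying). */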
1585 static mali_ptr
1586 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1587 unsigned stride, unsigned count)
1588 {
1589 /* Fill out the descriptor */
1590 slot->stride = stride;
1591 slot->size = stride * count;
1592 slot->shift = slot->extra_flags = 0;
1593
1594 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1595 slot->size);
1596
1597 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1598
1599 return transfer.gpu;
1600 }
1601
1602 static void
1603 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1604 unsigned stride, unsigned offset, unsigned count,
1605 struct pipe_stream_output_target *target)
1606 {
1607 /* Fill out the descriptor */
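/* Gallium expresses the stream output stride in dwords, so convert to
 * bytes for the hardware descriptor */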
1608 slot->stride = stride * 4;
1609 slot->shift = slot->extra_flags = 0;
1610
1611 unsigned max_size = target->buffer_size;
1612 unsigned expected_size = slot->stride * count;
1613
1614 slot->size = MIN2(max_size, expected_size);
1615
1616 /* Grab the BO and bind it to the batch */
1617 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1618
1619 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1620 * the perspective of the TILER and FRAGMENT.
1621 */
1622 panfrost_batch_add_bo(batch, bo,
1623 PAN_BO_ACCESS_SHARED |
1624 PAN_BO_ACCESS_RW |
1625 PAN_BO_ACCESS_VERTEX_TILER |
1626 PAN_BO_ACCESS_FRAGMENT);
1627
1628 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1629 slot->elements = addr;
1630 }
1631
1632 /* Given a shader and buffer indices, link varying metadata together */
1633
1634 static bool
1635 is_special_varying(gl_varying_slot loc)
1636 {
1637 switch (loc) {
1638 case VARYING_SLOT_POS:
1639 case VARYING_SLOT_PSIZ:
1640 case VARYING_SLOT_PNTC:
1641 case VARYING_SLOT_FACE:
1642 return true;
1643 default:
1644 return false;
1645 }
1646 }
1647
1648 static void
1649 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1650 signed general, signed gl_Position,
1651 signed gl_PointSize, signed gl_PointCoord,
1652 signed gl_FrontFacing)
1653 {
1654 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1655
1656 for (unsigned i = 0; i < ss->varying_count; ++i) {
1657 gl_varying_slot location = ss->varyings_loc[i];
1658 int index = -1;
1659
1660 switch (location) {
1661 case VARYING_SLOT_POS:
1662 index = gl_Position;
1663 break;
1664 case VARYING_SLOT_PSIZ:
1665 index = gl_PointSize;
1666 break;
1667 case VARYING_SLOT_PNTC:
1668 index = gl_PointCoord;
1669 break;
1670 case VARYING_SLOT_FACE:
1671 index = gl_FrontFacing;
1672 break;
1673 default:
1674 index = general;
1675 break;
1676 }
1677
1678 assert(index >= 0);
1679 out[i].index = index;
1680 }
1681 }
1682
1683 static bool
1684 has_point_coord(unsigned mask, gl_varying_slot loc)
1685 {
1686 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1687 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1688 else if (loc == VARYING_SLOT_PNTC)
1689 return (mask & (1 << 8));
1690 else
1691 return false;
1692 }
1693
1694 /* Helpers for manipulating stream out information so we can pack varyings
1695 * accordingly. Compute the src_offset for a given captured varying */
1696
1697 static struct pipe_stream_output *
1698 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1699 {
1700 for (unsigned i = 0; i < info->num_outputs; ++i) {
1701 if (info->output[i].register_index == loc)
1702 return &info->output[i];
1703 }
1704
1705 unreachable("Varying not captured");
1706 }
1707
1708 void
1709 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1710 unsigned vertex_count,
1711 struct mali_vertex_tiler_postfix *vertex_postfix,
1712 struct mali_vertex_tiler_postfix *tiler_postfix,
1713 union midgard_primitive_size *primitive_size)
1714 {
1715 /* Load the shaders */
1716 struct panfrost_context *ctx = batch->ctx;
1717 struct panfrost_shader_state *vs, *fs;
1718 unsigned int num_gen_varyings = 0;
1719 size_t vs_size, fs_size;
1720
1721 /* Allocate the varying descriptor */
1722
1723 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1724 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1725 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1726 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1727
1728 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1729 vs_size +
1730 fs_size);
1731
1732 struct pipe_stream_output_info *so = &vs->stream_output;
1733
1734 /* Check if this varying is linked by us. This is the case for
1735 * general-purpose, non-captured varyings. If it is, link it. If it's
1736 * not, use the provided stream out information to determine the
1737 * offset, since it was already linked for us. */
1738
1739 for (unsigned i = 0; i < vs->varying_count; i++) {
1740 gl_varying_slot loc = vs->varyings_loc[i];
1741
1742 bool special = is_special_varying(loc);
1743 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1744
1745 if (captured) {
1746 struct pipe_stream_output *o = pan_get_so(so, loc);
1747
1748 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1749 vs->varyings[i].src_offset = dst_offset;
1750 } else if (!special) {
1751 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1752 }
1753 }
1754
1755 /* Conversely, we need to set src_offset for the captured varyings.
1756 * Here, the layout is defined by the stream out info, not us */
1757
1758 /* Link up with fragment varyings */
1759 bool reads_point_coord = fs->reads_point_coord;
1760
1761 for (unsigned i = 0; i < fs->varying_count; i++) {
1762 gl_varying_slot loc = fs->varyings_loc[i];
1763 unsigned src_offset;
1764 signed vs_idx = -1;
1765
1766 /* Link up */
1767 for (unsigned j = 0; j < vs->varying_count; ++j) {
1768 if (vs->varyings_loc[j] == loc) {
1769 vs_idx = j;
1770 break;
1771 }
1772 }
1773
1774 /* Either assign or reuse */
1775 if (vs_idx >= 0)
1776 src_offset = vs->varyings[vs_idx].src_offset;
1777 else
1778 src_offset = 16 * (num_gen_varyings++);
1779
1780 fs->varyings[i].src_offset = src_offset;
1781
1782 if (has_point_coord(fs->point_sprite_mask, loc))
1783 reads_point_coord = true;
1784 }
1785
1786 memcpy(trans.cpu, vs->varyings, vs_size);
1787 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1788
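        /* The metadata is in place; now build the attribute records (buffers)
         * that the metadata's index field points at */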
1789 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1790
1791 /* Figure out how many streamout buffers could be bound */
1792 unsigned so_count = ctx->streamout.num_targets;
1793 for (unsigned i = 0; i < vs->varying_count; i++) {
1794 gl_varying_slot loc = vs->varyings_loc[i];
1795
1796 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1797 if (!captured) continue;
1798
1799 struct pipe_stream_output *o = pan_get_so(so, loc);
1800 so_count = MAX2(so_count, o->output_buffer + 1);
1801 }
1802
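        /* Allocate varying buffer slots: the stream out buffers come first,
         * then the general varying buffer, then one slot per special varying
         * that is actually used (unused specials get index -1) */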
1803 signed idx = so_count;
1804 signed general = idx++;
1805 signed gl_Position = idx++;
1806 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1807 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1808 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1809 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1810
1811 /* Emit the stream out buffers */
1812
1813 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1814 ctx->vertex_count);
1815
1816 for (unsigned i = 0; i < so_count; ++i) {
1817 if (i < ctx->streamout.num_targets) {
1818 panfrost_emit_streamout(batch, &varyings[i],
1819 so->stride[i],
1820 ctx->streamout.offsets[i],
1821 out_count,
1822 ctx->streamout.targets[i]);
1823 } else {
1824 /* Emit a dummy buffer */
1825 panfrost_emit_varyings(batch, &varyings[i],
1826 so->stride[i] * 4,
1827 out_count);
1828
1829 /* Clear the attribute type */
1830 varyings[i].elements &= ~0xF;
1831 }
1832 }
1833
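        /* General varyings are packed at 16 bytes (an fp32 vec4) per slot, as
         * assigned by src_offset above */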
1834 panfrost_emit_varyings(batch, &varyings[general],
1835 num_gen_varyings * 16,
1836 vertex_count);
1837
1838 mali_ptr varyings_p;
1839
1840 /* fp32 vec4 gl_Position */
1841 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1842 sizeof(float) * 4, vertex_count);
1843 tiler_postfix->position_varying = varyings_p;
1844
1845
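        /* 16-bit scalar gl_PointSize, one entry per vertex */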
1846 if (panfrost_writes_point_size(ctx)) {
1847 varyings_p = panfrost_emit_varyings(batch,
1848 &varyings[gl_PointSize],
1849 2, vertex_count);
1850 primitive_size->pointer = varyings_p;
1851 }
1852
1853 if (reads_point_coord)
1854 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1855
1856 if (fs->reads_face)
1857 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1858
1859 if (fs->reads_frag_coord)
1860 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1861
1862 struct panfrost_device *device = pan_device(ctx->base.screen);
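        /* gl_PointCoord isn't wired up for Bifrost in this path yet */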
1863 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1864
1865         /* Let's go ahead and link varying meta to the buffer in question, now
1866          * that that information is available. VARYING_SLOT_POS is mapped to
1867          * gl_FragCoord for fragment shaders but gl_Position for vertex
1868          * shaders */
1869
1870 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1871 gl_PointSize, gl_PointCoord,
1872 gl_FrontFacing);
1873
1874 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1875 gl_FragCoord, gl_PointSize,
1876 gl_PointCoord, gl_FrontFacing);
1877
1878         /* Patch the captured varyings to source from the stream out buffers */
1879
1880 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1881 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1882
1883 for (unsigned i = 0; i < vs->varying_count; i++) {
1884 gl_varying_slot loc = vs->varyings_loc[i];
1885
1886 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1887 if (!captured)
1888 continue;
1889
1890 struct pipe_stream_output *o = pan_get_so(so, loc);
1891 ovs[i].index = o->output_buffer;
1892
1893 assert(o->stream == 0);
1894 ovs[i].format = (vs->varyings[i].format & ~MALI_NR_CHANNELS(4))
1895 | MALI_NR_CHANNELS(o->num_components);
1896
1897 if (device->quirks & HAS_SWIZZLES)
1898 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1899 else
1900 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1901
1902 /* Link to the fragment */
1903 signed fs_idx = -1;
1904
1905 /* Link up */
1906 for (unsigned j = 0; j < fs->varying_count; ++j) {
1907 if (fs->varyings_loc[j] == loc) {
1908 fs_idx = j;
1909 break;
1910 }
1911 }
1912
1913 if (fs_idx >= 0) {
1914 ofs[fs_idx].index = ovs[i].index;
1915 ofs[fs_idx].format = ovs[i].format;
1916 ofs[fs_idx].swizzle = ovs[i].swizzle;
1917 }
1918 }
1919
1920 /* Replace point sprite */
1921 for (unsigned i = 0; i < fs->varying_count; i++) {
1922                 /* If we have a point sprite replacement, handle that here. We
1923                  * have to translate the location first. TODO: Flip Y in the shader
1924                  * instead; we already key the shader on this, it just isn't done yet */
1925
1926 if (has_point_coord(fs->point_sprite_mask,
1927 fs->varyings_loc[i])) {
1928 ofs[i].index = gl_PointCoord;
1929
1930 /* Swizzle out the z/w to 0/1 */
1931 ofs[i].format = MALI_RG16F;
1932 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1933 }
1934 }
1935
1936         /* Fix up unaligned addresses: records must be 64-byte aligned, so fold any misalignment into each consumer's src_offset */
1937 for (unsigned i = 0; i < so_count; ++i) {
1938 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1939 continue;
1940
1941 unsigned align = (varyings[i].elements & 63);
1942
1943 /* While we're at it, the SO buffers are linear */
1944
1945 if (!align) {
1946 varyings[i].elements |= MALI_ATTR_LINEAR;
1947 continue;
1948 }
1949
1950 /* We need to adjust alignment */
1951 varyings[i].elements &= ~63;
1952 varyings[i].elements |= MALI_ATTR_LINEAR;
1953 varyings[i].size += align;
1954
1955 for (unsigned v = 0; v < vs->varying_count; ++v) {
1956 if (ovs[v].index != i)
1957 continue;
1958
1959 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1960 }
1961
1962 for (unsigned f = 0; f < fs->varying_count; ++f) {
1963 if (ofs[f].index != i)
1964 continue;
1965
1966 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1967 }
1968 }
1969
1970 varyings_p = panfrost_upload_transient(batch, varyings,
1971 idx * sizeof(*varyings));
1972 vertex_postfix->varyings = varyings_p;
1973 tiler_postfix->varyings = varyings_p;
1974
1975 vertex_postfix->varying_meta = trans.gpu;
1976 tiler_postfix->varying_meta = trans.gpu + vs_size;
1977 }
1978
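/* Emit the vertex and tiler jobs for a draw, wrapping the shared
 * prefix/postfix in the GPU-specific payload and handling rasterizer discard
 * and the wallpaper (blit) hack */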
1979 void
1980 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1981 struct mali_vertex_tiler_prefix *vertex_prefix,
1982 struct mali_vertex_tiler_postfix *vertex_postfix,
1983 struct mali_vertex_tiler_prefix *tiler_prefix,
1984 struct mali_vertex_tiler_postfix *tiler_postfix,
1985 union midgard_primitive_size *primitive_size)
1986 {
1987 struct panfrost_context *ctx = batch->ctx;
1988 struct panfrost_device *device = pan_device(ctx->base.screen);
1989 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1990 struct bifrost_payload_vertex bifrost_vertex = {0,};
1991 struct bifrost_payload_tiler bifrost_tiler = {0,};
1992 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1993 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1994 void *vp, *tp;
1995 size_t vp_size, tp_size;
1996
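        /* Bifrost and Midgard use different payload layouts, so copy the
         * common prefix/postfix into the right wrapper before submitting */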
1997 if (device->quirks & IS_BIFROST) {
1998 bifrost_vertex.prefix = *vertex_prefix;
1999 bifrost_vertex.postfix = *vertex_postfix;
2000 vp = &bifrost_vertex;
2001 vp_size = sizeof(bifrost_vertex);
2002
2003 bifrost_tiler.prefix = *tiler_prefix;
2004 bifrost_tiler.tiler.primitive_size = *primitive_size;
2005 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
2006 bifrost_tiler.postfix = *tiler_postfix;
2007 tp = &bifrost_tiler;
2008 tp_size = sizeof(bifrost_tiler);
2009 } else {
2010 midgard_vertex.prefix = *vertex_prefix;
2011 midgard_vertex.postfix = *vertex_postfix;
2012 vp = &midgard_vertex;
2013 vp_size = sizeof(midgard_vertex);
2014
2015 midgard_tiler.prefix = *tiler_prefix;
2016 midgard_tiler.postfix = *tiler_postfix;
2017 midgard_tiler.primitive_size = *primitive_size;
2018 tp = &midgard_tiler;
2019 tp_size = sizeof(midgard_tiler);
2020 }
2021
2022 if (wallpapering) {
2023 /* Inject in reverse order, with "predicted" job indices.
2024 * THIS IS A HACK XXX */
2025 panfrost_new_job(batch, JOB_TYPE_TILER, false,
2026 batch->job_index + 2, tp, tp_size, true);
2027 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2028 vp, vp_size, true);
2029 return;
2030 }
2031
2032         /* If rasterizer discard is enabled, only submit the vertex job */
2033
2034 bool rasterizer_discard = ctx->rasterizer &&
2035 ctx->rasterizer->base.rasterizer_discard;
2036
2037 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
2038 vp, vp_size, false);
2039
2040 if (rasterizer_discard)
2041 return;
2042
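        /* Otherwise, submit the tiler job, depending on the vertex job's output */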
2043 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2044 false);
2045 }
2046
2047 /* TODO: stop hardcoding this */
2048 mali_ptr
2049 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2050 {
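        /* Sample positions, seemingly in 1/256ths of a pixel, so (128, 128)
         * is the pixel centre; presumably only the single-sample entries are
         * meaningful here, hence the TODO above */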
2051 uint16_t locations[] = {
2052 128, 128,
2053 0, 256,
2054 0, 256,
2055 0, 256,
2056 0, 256,
2057 0, 256,
2058 0, 256,
2059 0, 256,
2060 0, 256,
2061 0, 256,
2062 0, 256,
2063 0, 256,
2064 0, 256,
2065 0, 256,
2066 0, 256,
2067 0, 256,
2068 0, 256,
2069 0, 256,
2070 0, 256,
2071 0, 256,
2072 0, 256,
2073 0, 256,
2074 0, 256,
2075 0, 256,
2076 0, 256,
2077 0, 256,
2078 0, 256,
2079 0, 256,
2080 0, 256,
2081 0, 256,
2082 0, 256,
2083 0, 256,
2084 128, 128,
2085 0, 0,
2086 0, 0,
2087 0, 0,
2088 0, 0,
2089 0, 0,
2090 0, 0,
2091 0, 0,
2092 0, 0,
2093 0, 0,
2094 0, 0,
2095 0, 0,
2096 0, 0,
2097 0, 0,
2098 0, 0,
2099 0, 0,
2100 };
2101
2102 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2103 }