1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), though it may last longer.
191 * Also gets the bounds on the index buffer for the range accessed by the
192 * draw. We do these operations together because there are natural
193 * optimizations which require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
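/* Worked example (decomposition assumed from the shift/odd split below):
 * panfrost_padded_vertex_count() returns a value of the form
 * (2k + 1) << shift, i.e. an odd factor times a power of two. A padded
 * count of 12 = 3 << 2 therefore yields instance_shift = 2 and
 * instance_odd = k = 1, since (2 * 1 + 1) << 2 = 12. */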
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349 meta->midgard1.flags_hi = 0x8; /* XXX */
350 meta->midgard1.flags_lo = 0x220;
351 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
352 }
353 }
354
355 static unsigned
356 panfrost_translate_compare_func(enum pipe_compare_func in)
357 {
358 switch (in) {
359 case PIPE_FUNC_NEVER:
360 return MALI_FUNC_NEVER;
361
362 case PIPE_FUNC_LESS:
363 return MALI_FUNC_LESS;
364
365 case PIPE_FUNC_EQUAL:
366 return MALI_FUNC_EQUAL;
367
368 case PIPE_FUNC_LEQUAL:
369 return MALI_FUNC_LEQUAL;
370
371 case PIPE_FUNC_GREATER:
372 return MALI_FUNC_GREATER;
373
374 case PIPE_FUNC_NOTEQUAL:
375 return MALI_FUNC_NOTEQUAL;
376
377 case PIPE_FUNC_GEQUAL:
378 return MALI_FUNC_GEQUAL;
379
380 case PIPE_FUNC_ALWAYS:
381 return MALI_FUNC_ALWAYS;
382
383 default:
384 unreachable("Invalid func");
385 }
386 }
387
388 static unsigned
389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
390 {
391 switch (in) {
392 case PIPE_STENCIL_OP_KEEP:
393 return MALI_STENCIL_KEEP;
394
395 case PIPE_STENCIL_OP_ZERO:
396 return MALI_STENCIL_ZERO;
397
398 case PIPE_STENCIL_OP_REPLACE:
399 return MALI_STENCIL_REPLACE;
400
401 case PIPE_STENCIL_OP_INCR:
402 return MALI_STENCIL_INCR;
403
404 case PIPE_STENCIL_OP_DECR:
405 return MALI_STENCIL_DECR;
406
407 case PIPE_STENCIL_OP_INCR_WRAP:
408 return MALI_STENCIL_INCR_WRAP;
409
410 case PIPE_STENCIL_OP_DECR_WRAP:
411 return MALI_STENCIL_DECR_WRAP;
412
413 case PIPE_STENCIL_OP_INVERT:
414 return MALI_STENCIL_INVERT;
415
416 default:
417 unreachable("Invalid stencil op");
418 }
419 }
420
421 static unsigned
422 translate_tex_wrap(enum pipe_tex_wrap w)
423 {
424 switch (w) {
425 case PIPE_TEX_WRAP_REPEAT:
426 return MALI_WRAP_REPEAT;
427
428 case PIPE_TEX_WRAP_CLAMP:
429 return MALI_WRAP_CLAMP;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
432 return MALI_WRAP_CLAMP_TO_EDGE;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435 return MALI_WRAP_CLAMP_TO_BORDER;
436
437 case PIPE_TEX_WRAP_MIRROR_REPEAT:
438 return MALI_WRAP_MIRRORED_REPEAT;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP:
441 return MALI_WRAP_MIRRORED_CLAMP;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
444 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
448
449 default:
450 unreachable("Invalid wrap");
451 }
452 }
453
454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
455 struct mali_sampler_descriptor *hw)
456 {
457 unsigned func = panfrost_translate_compare_func(cso->compare_func);
458 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
459 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
460 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
461 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
462 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
463 unsigned mip_filter = mip_linear ?
464 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
465 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
466
467 *hw = (struct mali_sampler_descriptor) {
468 .filter_mode = min_filter | mag_filter | mip_filter |
469 normalized,
470 .wrap_s = translate_tex_wrap(cso->wrap_s),
471 .wrap_t = translate_tex_wrap(cso->wrap_t),
472 .wrap_r = translate_tex_wrap(cso->wrap_r),
473 .compare_func = panfrost_flip_compare_func(func),
474 .border_color = {
475 cso->border_color.f[0],
476 cso->border_color.f[1],
477 cso->border_color.f[2],
478 cso->border_color.f[3]
479 },
480 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
481 .max_lod = FIXED_16(cso->max_lod, false),
482 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
483 .seamless_cube_map = cso->seamless_cube_map,
484 };
485
486 /* If necessary, we disable mipmapping in the sampler descriptor by
487 * clamping the LOD as tight as possible (from 0 to epsilon,
488 * essentially -- remember these are fixed point numbers, so
489 * epsilon=1/256) */
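/* Worked example (assuming FIXED_16 is 8.8 fixed point, i.e. value * 256):
 * with min_lod = 0.0, the clamp below produces min_lod = 0 and max_lod = 1
 * in fixed point, an LOD range of [0, 1/256], so only the base level is
 * ever sampled. */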
490
491 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
492 hw->max_lod = hw->min_lod + 1;
493 }
494
495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
496 struct bifrost_sampler_descriptor *hw)
497 {
498 *hw = (struct bifrost_sampler_descriptor) {
499 .unk1 = 0x1,
500 .wrap_s = translate_tex_wrap(cso->wrap_s),
501 .wrap_t = translate_tex_wrap(cso->wrap_t),
502 .wrap_r = translate_tex_wrap(cso->wrap_r),
503 .unk8 = 0x8,
504 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
505 .norm_coords = cso->normalized_coords,
506 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
507 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
508 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
509 .max_lod = FIXED_16(cso->max_lod, false),
510 };
511
512 /* If necessary, we disable mipmapping in the sampler descriptor by
513 * clamping the LOD as tight as possible (from 0 to epsilon,
514 * essentially -- remember these are fixed point numbers, so
515 * epsilon=1/256) */
516
517 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
518 hw->max_lod = hw->min_lod + 1;
519 }
520
521 static void
522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
523 struct mali_stencil_test *out)
524 {
525 out->ref = 0; /* Gallium gets it from elsewhere */
526
527 out->mask = in->valuemask;
528 out->func = panfrost_translate_compare_func(in->func);
529 out->sfail = panfrost_translate_stencil_op(in->fail_op);
530 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
531 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
532 }
533
534 static void
535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
536 struct mali_shader_meta *fragmeta)
537 {
538 if (!ctx->rasterizer) {
539 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
540 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
541 fragmeta->depth_units = 0.0f;
542 fragmeta->depth_factor = 0.0f;
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
544 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
545 return;
546 }
547
548 bool msaa = ctx->rasterizer->base.multisample;
549
550 /* TODO: Sample size */
551 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
553 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
554 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
555
556 /* XXX: Which bit is which? Does this maybe allow offsetting non-tri primitives? */
557
558 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
559 ctx->rasterizer->base.offset_tri);
560 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
561 ctx->rasterizer->base.offset_tri);
562 }
563
564 static void
565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
566 struct mali_shader_meta *fragmeta)
567 {
568 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
569 int zfunc = PIPE_FUNC_ALWAYS;
570
571 if (!zsa) {
572 struct pipe_stencil_state default_stencil = {
573 .enabled = 0,
574 .func = PIPE_FUNC_ALWAYS,
575 .fail_op = MALI_STENCIL_KEEP,
576 .zfail_op = MALI_STENCIL_KEEP,
577 .zpass_op = MALI_STENCIL_KEEP,
578 .writemask = 0xFF,
579 .valuemask = 0xFF
580 };
581
582 panfrost_make_stencil_state(&default_stencil,
583 &fragmeta->stencil_front);
584 fragmeta->stencil_mask_front = default_stencil.writemask;
585 fragmeta->stencil_back = fragmeta->stencil_front;
586 fragmeta->stencil_mask_back = default_stencil.writemask;
587 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
588 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
589 } else {
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
591 zsa->stencil[0].enabled);
592 panfrost_make_stencil_state(&zsa->stencil[0],
593 &fragmeta->stencil_front);
594 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
595 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
596
597 /* If back-stencil is not enabled, use the front values */
598
599 if (zsa->stencil[1].enabled) {
600 panfrost_make_stencil_state(&zsa->stencil[1],
601 &fragmeta->stencil_back);
602 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
603 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
604 } else {
605 fragmeta->stencil_back = fragmeta->stencil_front;
606 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
607 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
608 }
609
610 if (zsa->depth.enabled)
611 zfunc = zsa->depth.func;
612
613 /* Depth state (TODO: Refactor) */
614
615 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
616 zsa->depth.writemask);
617 }
618
619 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
620 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
621 }
622
623 static void
624 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
625 struct mali_shader_meta *fragmeta,
626 void *rts)
627 {
628 const struct panfrost_device *dev = pan_device(ctx->base.screen);
629
630 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
631 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
632 !ctx->blend->base.dither);
633
634 /* Get blending setup */
635 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
636
637 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
638 unsigned shader_offset = 0;
639 struct panfrost_bo *shader_bo = NULL;
640
641 for (unsigned c = 0; c < rt_count; ++c)
642 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
643 &shader_offset);
644
645 /* If there is a blend shader, work registers are shared. XXX: opt */
646
647 if (!(dev->quirks & IS_BIFROST)) {
648 for (unsigned c = 0; c < rt_count; ++c) {
649 if (blend[c].is_shader)
650 fragmeta->midgard1.work_count = 16;
651 }
652 }
653
654 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
655 * copied to the appended blend_meta (by convention), but this is the
656 * field actually read by the hardware (or maybe both are read?).
657 * Point it at the last render target that has a blend shader. */
658
659 fragmeta->blend.shader = 0;
660
661 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
662 if (!blend[rt].is_shader)
663 continue;
664
665 fragmeta->blend.shader = blend[rt].shader.gpu |
666 blend[rt].shader.first_tag;
667 break;
668 }
669
670 if (dev->quirks & MIDGARD_SFBD) {
671 /* On single render target (SFBD) platforms, the blend information
672 * lives inside the shader meta itself. We additionally need to signal
673 * CAN_DISCARD for nontrivial blend modes (so we're able to read back
674 * the destination buffer) */
675
676 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
677 blend[0].is_shader);
678
679 if (!blend[0].is_shader) {
680 fragmeta->blend.equation = *blend[0].equation.equation;
681 fragmeta->blend.constant = blend[0].equation.constant;
682 }
683
684 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
685 !blend[0].no_blending);
686 return;
687 }
688
689 /* Additional blend descriptor tacked on for jobs using MFBD */
690
691 for (unsigned i = 0; i < rt_count; ++i) {
692 if (dev->quirks & IS_BIFROST) {
693 struct bifrost_blend_rt *brts = rts;
694 struct panfrost_shader_state *fs;
695 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
696
697 brts[i].flags = 0x200;
698 if (blend[i].is_shader) {
699 /* The blend shader's address needs to be at
700 * the same top 32 bit as the fragment shader.
701 * TODO: Ensure that's always the case.
702 */
703 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
704 (fs->bo->gpu & (0xffffffffull << 32)));
705 brts[i].shader = blend[i].shader.gpu;
706 brts[i].unk2 = 0x0;
707 } else {
708 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
709 const struct util_format_description *format_desc;
710 format_desc = util_format_description(format);
711
712 brts[i].equation = *blend[i].equation.equation;
713
714 /* TODO: this is a bit more complicated */
715 brts[i].constant = blend[i].equation.constant;
716
717 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
718 brts[i].unk2 = 0x19;
719
720 brts[i].shader_type = fs->blend_types[i];
721 }
722 } else {
723 struct midgard_blend_rt *mrts = rts;
724
725 mrts[i].flags = 0x200;
726
727 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
728 (ctx->pipe_framebuffer.cbufs[i]) &&
729 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
730
731 SET_BIT(mrts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
732 SET_BIT(mrts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
733 SET_BIT(mrts[i].flags, MALI_BLEND_SRGB, is_srgb);
734 SET_BIT(mrts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
735
736 if (blend[i].is_shader) {
737 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
738 } else {
739 mrts[i].blend.equation = *blend[i].equation.equation;
740 mrts[i].blend.constant = blend[i].equation.constant;
741 }
742 }
743 }
744 }
745
746 static void
747 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
748 struct mali_shader_meta *fragmeta,
749 void *rts)
750 {
751 const struct panfrost_device *dev = pan_device(ctx->base.screen);
752 struct panfrost_shader_state *fs;
753
754 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
755
756 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
757 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
758 fragmeta->unknown2_4 = 0x4e0;
759
760 /* unknown2_4 has 0x10 bit set on T6XX and T720. We don't know why this
761 * is required (independent of 32-bit/64-bit descriptors), or why it's
762 * not used on later GPU revisions. Otherwise, all shader jobs fault on
763 * these earlier chips (perhaps this is a chicken bit of some kind).
764 * More investigation is needed. */
765
766 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
767
768 if (dev->quirks & IS_BIFROST) {
769 /* TODO */
770 } else {
771 /* Depending on whether it's legal to do so in the given shader, we try
772 * to enable early-z testing (or forward-pixel kill?) */
773
774 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
775 !fs->can_discard && !fs->writes_depth);
776
777 /* Add the writes Z/S flags if needed. */
778 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
779 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
780
781 /* Any time texturing is used, derivatives are implicitly calculated,
782 * so we need to enable helper invocations */
783
784 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
785 fs->helper_invocations);
786
787 /* CAN_DISCARD should be set if the fragment shader possibly contains a
788 * 'discard' instruction. This is likely related to optimizations around
789 * forward-pixel kill, as per "Mali Performance 3: Is
790 * EGL_BUFFER_PRESERVED a good thing?" by Peter Harris */
791
792 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD, fs->can_discard);
793 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, fs->can_discard);
794 }
795
796 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
797 panfrost_frag_meta_zsa_update(ctx, fragmeta);
798 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
799 }
800
801 void
802 panfrost_emit_shader_meta(struct panfrost_batch *batch,
803 enum pipe_shader_type st,
804 struct mali_vertex_tiler_postfix *postfix)
805 {
806 struct panfrost_context *ctx = batch->ctx;
807 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
808
809 if (!ss) {
810 postfix->shader = 0;
811 return;
812 }
813
814 struct mali_shader_meta meta;
815
816 panfrost_shader_meta_init(ctx, st, &meta);
817
818 /* Add the shader BO to the batch. */
819 panfrost_batch_add_bo(batch, ss->bo,
820 PAN_BO_ACCESS_PRIVATE |
821 PAN_BO_ACCESS_READ |
822 panfrost_bo_access_for_stage(st));
823
824 mali_ptr shader_ptr;
825
826 if (st == PIPE_SHADER_FRAGMENT) {
827 struct panfrost_device *dev = pan_device(ctx->base.screen);
828 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
829 size_t desc_size = sizeof(meta);
830 void *rts = NULL;
831 struct panfrost_transfer xfer;
832 unsigned rt_size;
833
834 if (dev->quirks & MIDGARD_SFBD)
835 rt_size = 0;
836 else if (dev->quirks & IS_BIFROST)
837 rt_size = sizeof(struct bifrost_blend_rt);
838 else
839 rt_size = sizeof(struct midgard_blend_rt);
840
841 desc_size += rt_size * rt_count;
842
843 if (rt_size)
844 rts = rzalloc_size(ctx, rt_size * rt_count);
845
846 panfrost_frag_shader_meta_init(ctx, &meta, rts);
847
848 xfer = panfrost_allocate_transient(batch, desc_size);
849
850 memcpy(xfer.cpu, &meta, sizeof(meta));
851 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
852
853 if (rt_size)
854 ralloc_free(rts);
855
856 shader_ptr = xfer.gpu;
857 } else {
858 shader_ptr = panfrost_upload_transient(batch, &meta,
859 sizeof(meta));
860 }
861
862 postfix->shader = shader_ptr;
863 }
864
865 static void
866 panfrost_mali_viewport_init(struct panfrost_context *ctx,
867 struct mali_viewport *mvp)
868 {
869 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
870
871 /* Clip bounds are encoded as floats. The viewport itself is encoded as
872 * (somewhat) asymmetric ints. */
873
874 const struct pipe_scissor_state *ss = &ctx->scissor;
875
876 memset(mvp, 0, sizeof(*mvp));
877
878 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
879 * each direction. Clipping to the viewport in theory should work, but
880 * in practice causes issues when we're not explicitly trying to
881 * scissor */
882
883 *mvp = (struct mali_viewport) {
884 .clip_minx = -INFINITY,
885 .clip_miny = -INFINITY,
886 .clip_maxx = INFINITY,
887 .clip_maxy = INFINITY,
888 };
889
890 /* Always scissor to the viewport by default. */
891 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
892 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
893
894 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
895 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
896
897 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
898 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
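/* Illustrative example (state values assumed, not taken from a real
 * state tracker): a standard 800x600 viewport gives translate = (400, 300)
 * and scale = (400, -300) in x/y, so vp_minx = 0, vp_maxx = 800,
 * vp_miny = 0 and vp_maxy = 600 before the scissor intersection below. */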
899
900 /* Apply the scissor test */
901
902 unsigned minx, miny, maxx, maxy;
903
904 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
905 minx = MAX2(ss->minx, vp_minx);
906 miny = MAX2(ss->miny, vp_miny);
907 maxx = MIN2(ss->maxx, vp_maxx);
908 maxy = MIN2(ss->maxy, vp_maxy);
909 } else {
910 minx = vp_minx;
911 miny = vp_miny;
912 maxx = vp_maxx;
913 maxy = vp_maxy;
914 }
915
916 /* Hardware needs the min/max to be strictly ordered, so flip if we
917 * need to. The viewport transformation in the vertex shader will
918 * handle the negatives if we don't */
919
920 if (miny > maxy) {
921 unsigned temp = miny;
922 miny = maxy;
923 maxy = temp;
924 }
925
926 if (minx > maxx) {
927 unsigned temp = minx;
928 minx = maxx;
929 maxx = temp;
930 }
931
932 if (minz > maxz) {
933 float temp = minz;
934 minz = maxz;
935 maxz = temp;
936 }
937
938 /* Clamp to the framebuffer size as a last check */
939
940 minx = MIN2(ctx->pipe_framebuffer.width, minx);
941 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
942
943 miny = MIN2(ctx->pipe_framebuffer.height, miny);
944 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
945
946 /* Upload */
947
948 mvp->viewport0[0] = minx;
949 mvp->viewport1[0] = MALI_POSITIVE(maxx);
950
951 mvp->viewport0[1] = miny;
952 mvp->viewport1[1] = MALI_POSITIVE(maxy);
953
954 mvp->clip_minz = minz;
955 mvp->clip_maxz = maxz;
956 }
957
958 void
959 panfrost_emit_viewport(struct panfrost_batch *batch,
960 struct mali_vertex_tiler_postfix *tiler_postfix)
961 {
962 struct panfrost_context *ctx = batch->ctx;
963 struct mali_viewport mvp;
964
965 panfrost_mali_viewport_init(batch->ctx, &mvp);
966
967 /* Update the job, unless we're doing wallpapering (whose lack of
968 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
969 * just... be faster :) */
970
971 if (!ctx->wallpaper_batch)
972 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
973 mvp.viewport0[1],
974 mvp.viewport1[0] + 1,
975 mvp.viewport1[1] + 1);
976
977 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
978 sizeof(mvp));
979 }
980
981 static mali_ptr
982 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
983 enum pipe_shader_type st,
984 struct panfrost_constant_buffer *buf,
985 unsigned index)
986 {
987 struct pipe_constant_buffer *cb = &buf->cb[index];
988 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
989
990 if (rsrc) {
991 panfrost_batch_add_bo(batch, rsrc->bo,
992 PAN_BO_ACCESS_SHARED |
993 PAN_BO_ACCESS_READ |
994 panfrost_bo_access_for_stage(st));
995
996 /* Alignment guaranteed by
997 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
998 return rsrc->bo->gpu + cb->buffer_offset;
999 } else if (cb->user_buffer) {
1000 return panfrost_upload_transient(batch,
1001 cb->user_buffer +
1002 cb->buffer_offset,
1003 cb->buffer_size);
1004 } else {
1005 unreachable("No constant buffer");
1006 }
1007 }
1008
1009 struct sysval_uniform {
1010 union {
1011 float f[4];
1012 int32_t i[4];
1013 uint32_t u[4];
1014 uint64_t du[2];
1015 };
1016 };
1017
1018 static void
1019 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1020 struct sysval_uniform *uniform)
1021 {
1022 struct panfrost_context *ctx = batch->ctx;
1023 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1024
1025 uniform->f[0] = vp->scale[0];
1026 uniform->f[1] = vp->scale[1];
1027 uniform->f[2] = vp->scale[2];
1028 }
1029
1030 static void
1031 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1032 struct sysval_uniform *uniform)
1033 {
1034 struct panfrost_context *ctx = batch->ctx;
1035 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1036
1037 uniform->f[0] = vp->translate[0];
1038 uniform->f[1] = vp->translate[1];
1039 uniform->f[2] = vp->translate[2];
1040 }
1041
1042 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1043 enum pipe_shader_type st,
1044 unsigned int sysvalid,
1045 struct sysval_uniform *uniform)
1046 {
1047 struct panfrost_context *ctx = batch->ctx;
1048 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1049 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1050 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1051 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1052
1053 assert(dim);
1054 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1055
1056 if (dim > 1)
1057 uniform->i[1] = u_minify(tex->texture->height0,
1058 tex->u.tex.first_level);
1059
1060 if (dim > 2)
1061 uniform->i[2] = u_minify(tex->texture->depth0,
1062 tex->u.tex.first_level);
1063
1064 if (is_array)
1065 uniform->i[dim] = tex->texture->array_size;
1066 }
1067
1068 static void
1069 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1070 enum pipe_shader_type st,
1071 unsigned ssbo_id,
1072 struct sysval_uniform *uniform)
1073 {
1074 struct panfrost_context *ctx = batch->ctx;
1075
1076 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1077 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1078
1079 /* Compute address */
1080 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1081
1082 panfrost_batch_add_bo(batch, bo,
1083 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1084 panfrost_bo_access_for_stage(st));
1085
1086 /* Upload address and size as sysval */
1087 uniform->du[0] = bo->gpu + sb.buffer_offset;
1088 uniform->u[2] = sb.buffer_size;
1089 }
1090
1091 static void
1092 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1093 enum pipe_shader_type st,
1094 unsigned samp_idx,
1095 struct sysval_uniform *uniform)
1096 {
1097 struct panfrost_context *ctx = batch->ctx;
1098 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1099
1100 uniform->f[0] = sampl->min_lod;
1101 uniform->f[1] = sampl->max_lod;
1102 uniform->f[2] = sampl->lod_bias;
1103
1104 /* Even without any errata, Midgard represents "no mipmapping" as
1105 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1106 * panfrost_create_sampler_state which also explains our choice of
1107 * epsilon value (again to keep behaviour consistent) */
1108
1109 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1110 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1111 }
1112
1113 static void
1114 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1115 struct sysval_uniform *uniform)
1116 {
1117 struct panfrost_context *ctx = batch->ctx;
1118
1119 uniform->u[0] = ctx->compute_grid->grid[0];
1120 uniform->u[1] = ctx->compute_grid->grid[1];
1121 uniform->u[2] = ctx->compute_grid->grid[2];
1122 }
1123
1124 static void
1125 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1126 struct panfrost_shader_state *ss,
1127 enum pipe_shader_type st)
1128 {
1129 struct sysval_uniform *uniforms = (void *)buf;
1130
1131 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1132 int sysval = ss->sysval[i];
1133
1134 switch (PAN_SYSVAL_TYPE(sysval)) {
1135 case PAN_SYSVAL_VIEWPORT_SCALE:
1136 panfrost_upload_viewport_scale_sysval(batch,
1137 &uniforms[i]);
1138 break;
1139 case PAN_SYSVAL_VIEWPORT_OFFSET:
1140 panfrost_upload_viewport_offset_sysval(batch,
1141 &uniforms[i]);
1142 break;
1143 case PAN_SYSVAL_TEXTURE_SIZE:
1144 panfrost_upload_txs_sysval(batch, st,
1145 PAN_SYSVAL_ID(sysval),
1146 &uniforms[i]);
1147 break;
1148 case PAN_SYSVAL_SSBO:
1149 panfrost_upload_ssbo_sysval(batch, st,
1150 PAN_SYSVAL_ID(sysval),
1151 &uniforms[i]);
1152 break;
1153 case PAN_SYSVAL_NUM_WORK_GROUPS:
1154 panfrost_upload_num_work_groups_sysval(batch,
1155 &uniforms[i]);
1156 break;
1157 case PAN_SYSVAL_SAMPLER:
1158 panfrost_upload_sampler_sysval(batch, st,
1159 PAN_SYSVAL_ID(sysval),
1160 &uniforms[i]);
1161 break;
1162 default:
1163 assert(0);
1164 }
1165 }
1166 }
1167
1168 static const void *
1169 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1170 unsigned index)
1171 {
1172 struct pipe_constant_buffer *cb = &buf->cb[index];
1173 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1174
1175 if (rsrc)
1176 return rsrc->bo->cpu;
1177 else if (cb->user_buffer)
1178 return cb->user_buffer;
1179 else
1180 unreachable("No constant buffer");
1181 }
1182
1183 void
1184 panfrost_emit_const_buf(struct panfrost_batch *batch,
1185 enum pipe_shader_type stage,
1186 struct mali_vertex_tiler_postfix *postfix)
1187 {
1188 struct panfrost_context *ctx = batch->ctx;
1189 struct panfrost_shader_variants *all = ctx->shader[stage];
1190
1191 if (!all)
1192 return;
1193
1194 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1195
1196 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1197
1198 /* Uniforms are implicitly UBO #0 */
1199 bool has_uniforms = buf->enabled_mask & (1 << 0);
1200
1201 /* Allocate room for the sysval and the uniforms */
1202 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1203 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1204 size_t size = sys_size + uniform_size;
1205 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1206 size);
1207
1208 /* Upload sysvals requested by the shader */
1209 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1210
1211 /* Upload uniforms */
1212 if (has_uniforms && uniform_size) {
1213 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1214 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1215 }
1216
1217 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1218 * uploaded */
1219
1220 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1221 assert(ubo_count >= 1);
1222
1223 size_t sz = sizeof(uint64_t) * ubo_count;
1224 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1225 int uniform_count = ss->uniform_count;
1226
1227 /* Upload uniforms as a UBO */
1228 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1229
1230 /* The rest are honest-to-goodness UBOs */
1231
1232 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1233 size_t usz = buf->cb[ubo].buffer_size;
1234 bool enabled = buf->enabled_mask & (1 << ubo);
1235 bool empty = usz == 0;
1236
1237 if (!enabled || empty) {
1238 /* Stub out disabled UBOs to catch accesses */
1239 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1240 continue;
1241 }
1242
1243 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1244 buf, ubo);
1245
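/* UBO entries are counted in 16-byte (vec4-sized) fields. As a
 * hypothetical example, a 100-byte UBO is aligned up to 112 bytes,
 * i.e. 7 fields, so the descriptor below would advertise 7 entries
 * starting at the mapped GPU address. */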
1246 unsigned bytes_per_field = 16;
1247 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1248 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1249 }
1250
1251 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1252 postfix->uniforms = transfer.gpu;
1253 postfix->uniform_buffers = ubufs;
1254
1255 buf->dirty_mask = 0;
1256 }
1257
1258 void
1259 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1260 const struct pipe_grid_info *info,
1261 struct midgard_payload_vertex_tiler *vtp)
1262 {
1263 struct panfrost_context *ctx = batch->ctx;
1264 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1265 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1266 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1267 128));
1268 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1269 info->grid[2] * 4;
1270 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1271 shared_size,
1272 1);
1273
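/* Illustrative sizing example (numbers assumed): with ss->shared_size =
 * 512 and an 8x8x1 grid, single_size = 512, shared_size = 512 * 8 * 8 *
 * 1 * 4 = 131072 bytes, shared_workgroup_count = 3 + 3 + 0 = 6 and
 * shared_shift = log2(512) - 1 = 8. */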
1274 struct mali_shared_memory shared = {
1275 .shared_memory = bo->gpu,
1276 .shared_workgroup_count =
1277 util_logbase2_ceil(info->grid[0]) +
1278 util_logbase2_ceil(info->grid[1]) +
1279 util_logbase2_ceil(info->grid[2]),
1280 .shared_unk1 = 0x2,
1281 .shared_shift = util_logbase2(single_size) - 1
1282 };
1283
1284 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1285 sizeof(shared));
1286 }
1287
1288 static mali_ptr
1289 panfrost_get_tex_desc(struct panfrost_batch *batch,
1290 enum pipe_shader_type st,
1291 struct panfrost_sampler_view *view)
1292 {
1293 if (!view)
1294 return (mali_ptr) 0;
1295
1296 struct pipe_sampler_view *pview = &view->base;
1297 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1298
1299 /* Add the BO to the job so it's retained until the job is done. */
1300
1301 panfrost_batch_add_bo(batch, rsrc->bo,
1302 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1303 panfrost_bo_access_for_stage(st));
1304
1305 panfrost_batch_add_bo(batch, view->midgard_bo,
1306 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1307 panfrost_bo_access_for_stage(st));
1308
1309 return view->midgard_bo->gpu;
1310 }
1311
1312 void
1313 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1314 enum pipe_shader_type stage,
1315 struct mali_vertex_tiler_postfix *postfix)
1316 {
1317 struct panfrost_context *ctx = batch->ctx;
1318 struct panfrost_device *device = pan_device(ctx->base.screen);
1319
1320 if (!ctx->sampler_view_count[stage])
1321 return;
1322
1323 if (device->quirks & IS_BIFROST) {
1324 struct bifrost_texture_descriptor *descriptors;
1325
1326 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1327 ctx->sampler_view_count[stage]);
1328
1329 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1330 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1331 struct pipe_sampler_view *pview = &view->base;
1332 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1333
1334 /* Add the BOs to the job so they are retained until the job is done. */
1335
1336 panfrost_batch_add_bo(batch, rsrc->bo,
1337 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1338 panfrost_bo_access_for_stage(stage));
1339
1340 panfrost_batch_add_bo(batch, view->bifrost_bo,
1341 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1342 panfrost_bo_access_for_stage(stage));
1343
1344 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1345 }
1346
1347 postfix->textures = panfrost_upload_transient(batch,
1348 descriptors,
1349 sizeof(struct bifrost_texture_descriptor) *
1350 ctx->sampler_view_count[stage]);
1351
1352 free(descriptors);
1353 } else {
1354 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1355
1356 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1357 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1358 ctx->sampler_views[stage][i]);
1359
1360 postfix->textures = panfrost_upload_transient(batch,
1361 trampolines,
1362 sizeof(uint64_t) *
1363 ctx->sampler_view_count[stage]);
1364 }
1365 }
1366
1367 void
1368 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1369 enum pipe_shader_type stage,
1370 struct mali_vertex_tiler_postfix *postfix)
1371 {
1372 struct panfrost_context *ctx = batch->ctx;
1373 struct panfrost_device *device = pan_device(ctx->base.screen);
1374
1375 if (!ctx->sampler_count[stage])
1376 return;
1377
1378 if (device->quirks & IS_BIFROST) {
1379 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1380 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1381 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1382 transfer_size);
1383 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1384
1385 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1386 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1387
1388 postfix->sampler_descriptor = transfer.gpu;
1389 } else {
1390 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1391 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1392 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1393 transfer_size);
1394 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1395
1396 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1397 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1398
1399 postfix->sampler_descriptor = transfer.gpu;
1400 }
1401 }
1402
1403 void
1404 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1405 struct mali_vertex_tiler_postfix *vertex_postfix)
1406 {
1407 struct panfrost_context *ctx = batch->ctx;
1408
1409 if (!ctx->vertex)
1410 return;
1411
1412 struct panfrost_vertex_state *so = ctx->vertex;
1413
1414 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1415 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1416 sizeof(*so->hw) *
1417 PAN_MAX_ATTRIBUTE);
1418 }
1419
1420 void
1421 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1422 struct mali_vertex_tiler_postfix *vertex_postfix)
1423 {
1424 struct panfrost_context *ctx = batch->ctx;
1425 struct panfrost_vertex_state *so = ctx->vertex;
1426
1427 /* Staged mali_attr, and an index into them. i != k in general, depending
1428 * on the vertex buffer mask and instancing. Twice as much room is
1429 * allocated, for a worst case of NPOT_DIVIDEs, which take up an extra slot */
1430 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1431 unsigned k = 0;
1432
1433 for (unsigned i = 0; i < so->num_elements; ++i) {
1434 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1435 * means duplicating some vertex buffers (who cares? aside from
1436 * maybe some caching implications but I somehow doubt that
1437 * matters) */
1438
1439 struct pipe_vertex_element *elem = &so->pipe[i];
1440 unsigned vbi = elem->vertex_buffer_index;
1441
1442 /* The exception to the 1:1 mapping is that we can have multiple
1443 * entries (NPOT divisors), so we fix up the index anyway */
1444
1445 so->hw[i].index = k;
1446
1447 if (!(ctx->vb_mask & (1 << vbi)))
1448 continue;
1449
1450 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1451 struct panfrost_resource *rsrc;
1452
1453 rsrc = pan_resource(buf->buffer.resource);
1454 if (!rsrc)
1455 continue;
1456
1457 /* Align to 64 bytes by masking off the lower bits. This
1458 * will be adjusted back when we fixup the src_offset in
1459 * mali_attr_meta */
1460
1461 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1462 mali_ptr addr = raw_addr & ~63;
1463 unsigned chopped_addr = raw_addr - addr;
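/* e.g. a (hypothetical) raw_addr of 0x1030 gives addr = 0x1000 and
 * chopped_addr = 0x30; the chopped bytes are added back to the size
 * below, and the src_offset fixup in mali_attr_meta accounts for the
 * shifted base. */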
1464
1465 /* Add a dependency of the batch on the vertex buffer */
1466 panfrost_batch_add_bo(batch, rsrc->bo,
1467 PAN_BO_ACCESS_SHARED |
1468 PAN_BO_ACCESS_READ |
1469 PAN_BO_ACCESS_VERTEX_TILER);
1470
1471 /* Set common fields */
1472 attrs[k].elements = addr;
1473 attrs[k].stride = buf->stride;
1474
1475 /* Since we advanced the base pointer, we shrink the buffer
1476 * size */
1477 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1478
1479 /* We need to add the extra size we masked off (for
1480 * correctness) so the data doesn't get clamped away */
1481 attrs[k].size += chopped_addr;
1482
1483 /* For non-instancing make sure we initialize */
1484 attrs[k].shift = attrs[k].extra_flags = 0;
1485
1486 /* Instancing uses a dramatically different code path than
1487 * linear, so dispatch for the actual emission now that the
1488 * common code is finished */
1489
1490 unsigned divisor = elem->instance_divisor;
1491
1492 if (divisor && ctx->instance_count == 1) {
1493 /* Silly corner case where there's a divisor(=1) but
1494 * there's no legitimate instancing. So we want *every*
1495 * attribute to be the same. So set stride to zero so
1496 * we don't go anywhere. */
1497
1498 attrs[k].size = attrs[k].stride + chopped_addr;
1499 attrs[k].stride = 0;
1500 attrs[k++].elements |= MALI_ATTR_LINEAR;
1501 } else if (ctx->instance_count <= 1) {
1502 /* Normal, non-instanced attributes */
1503 attrs[k++].elements |= MALI_ATTR_LINEAR;
1504 } else {
1505 unsigned instance_shift = vertex_postfix->instance_shift;
1506 unsigned instance_odd = vertex_postfix->instance_odd;
1507
1508 k += panfrost_vertex_instanced(ctx->padded_count,
1509 instance_shift,
1510 instance_odd,
1511 divisor, &attrs[k]);
1512 }
1513 }
1514
1515 /* Add special gl_VertexID/gl_InstanceID buffers */
1516
1517 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1518 so->hw[PAN_VERTEX_ID].index = k++;
1519 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1520 so->hw[PAN_INSTANCE_ID].index = k++;
1521
1522 /* Upload whatever we emitted and go */
1523
1524 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1525 k * sizeof(*attrs));
1526 }
1527
1528 static mali_ptr
1529 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1530 unsigned stride, unsigned count)
1531 {
1532 /* Fill out the descriptor */
1533 slot->stride = stride;
1534 slot->size = stride * count;
1535 slot->shift = slot->extra_flags = 0;
1536
1537 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1538 slot->size);
1539
1540 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1541
1542 return transfer.gpu;
1543 }
1544
1545 static void
1546 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1547 unsigned stride, unsigned offset, unsigned count,
1548 struct pipe_stream_output_target *target)
1549 {
1550 /* Fill out the descriptor */
1551 slot->stride = stride * 4;
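/* pipe_stream_output_info strides are expressed in dwords, so multiply
 * by 4 to get a stride in bytes */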
1552 slot->shift = slot->extra_flags = 0;
1553
1554 unsigned max_size = target->buffer_size;
1555 unsigned expected_size = slot->stride * count;
1556
1557 slot->size = MIN2(max_size, expected_size);
1558
1559 /* Grab the BO and bind it to the batch */
1560 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1561
1562 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1563 * the perspective of the TILER and FRAGMENT.
1564 */
1565 panfrost_batch_add_bo(batch, bo,
1566 PAN_BO_ACCESS_SHARED |
1567 PAN_BO_ACCESS_RW |
1568 PAN_BO_ACCESS_VERTEX_TILER |
1569 PAN_BO_ACCESS_FRAGMENT);
1570
1571 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1572 slot->elements = addr;
1573 }
1574
1575 /* Given a shader and buffer indices, link varying metadata together */
1576
1577 static bool
1578 is_special_varying(gl_varying_slot loc)
1579 {
1580 switch (loc) {
1581 case VARYING_SLOT_POS:
1582 case VARYING_SLOT_PSIZ:
1583 case VARYING_SLOT_PNTC:
1584 case VARYING_SLOT_FACE:
1585 return true;
1586 default:
1587 return false;
1588 }
1589 }
1590
1591 static void
1592 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1593 signed general, signed gl_Position,
1594 signed gl_PointSize, signed gl_PointCoord,
1595 signed gl_FrontFacing)
1596 {
1597 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1598
1599 for (unsigned i = 0; i < ss->varying_count; ++i) {
1600 gl_varying_slot location = ss->varyings_loc[i];
1601 int index = -1;
1602
1603 switch (location) {
1604 case VARYING_SLOT_POS:
1605 index = gl_Position;
1606 break;
1607 case VARYING_SLOT_PSIZ:
1608 index = gl_PointSize;
1609 break;
1610 case VARYING_SLOT_PNTC:
1611 index = gl_PointCoord;
1612 break;
1613 case VARYING_SLOT_FACE:
1614 index = gl_FrontFacing;
1615 break;
1616 default:
1617 index = general;
1618 break;
1619 }
1620
1621 assert(index >= 0);
1622 out[i].index = index;
1623 }
1624 }
1625
1626 static bool
1627 has_point_coord(unsigned mask, gl_varying_slot loc)
1628 {
1629 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1630 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1631 else if (loc == VARYING_SLOT_PNTC)
1632 return (mask & (1 << 8));
1633 else
1634 return false;
1635 }
1636
1637 /* Helpers for manipulating stream out information so we can pack varyings
1638 * accordingly. Compute the src_offset for a given captured varying */
1639
1640 static struct pipe_stream_output *
1641 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1642 {
1643 for (unsigned i = 0; i < info->num_outputs; ++i) {
1644 if (info->output[i].register_index == loc)
1645 return &info->output[i];
1646 }
1647
1648 unreachable("Varying not captured");
1649 }
1650
1651 /* TODO: Integers */
1652 static enum mali_format
1653 pan_xfb_format(unsigned nr_components)
1654 {
1655 switch (nr_components) {
1656 case 1: return MALI_R32F;
1657 case 2: return MALI_RG32F;
1658 case 3: return MALI_RGB32F;
1659 case 4: return MALI_RGBA32F;
1660 default: unreachable("Invalid format");
1661 }
1662 }
1663
1664 void
1665 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1666 unsigned vertex_count,
1667 struct mali_vertex_tiler_postfix *vertex_postfix,
1668 struct mali_vertex_tiler_postfix *tiler_postfix,
1669 union midgard_primitive_size *primitive_size)
1670 {
1671 /* Load the shaders */
1672 struct panfrost_context *ctx = batch->ctx;
1673 struct panfrost_shader_state *vs, *fs;
1674 unsigned int num_gen_varyings = 0;
1675 size_t vs_size, fs_size;
1676
1677 /* Allocate the varying descriptor */
1678
1679 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1680 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1681 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1682 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1683
1684 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1685 vs_size +
1686 fs_size);
1687
1688 struct pipe_stream_output_info *so = &vs->stream_output;
1689
1690 /* Check if this varying is linked by us. This is the case for
1691 * general-purpose, non-captured varyings. If it is, link it. If it's
1692 * not, use the provided stream out information to determine the
1693 * offset, since it was already linked for us. */
1694
1695 for (unsigned i = 0; i < vs->varying_count; i++) {
1696 gl_varying_slot loc = vs->varyings_loc[i];
1697
1698 bool special = is_special_varying(loc);
1699 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1700
1701 if (captured) {
1702 struct pipe_stream_output *o = pan_get_so(so, loc);
1703
1704 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1705 vs->varyings[i].src_offset = dst_offset;
1706 } else if (!special) {
1707 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1708 }
1709 }
1710
1711 /* Conversely, we need to set src_offset for the captured varyings.
1712 * Here, the layout is defined by the stream out info, not us */
1713
1714 /* Link up with fragment varyings */
1715 bool reads_point_coord = fs->reads_point_coord;
1716
1717 for (unsigned i = 0; i < fs->varying_count; i++) {
1718 gl_varying_slot loc = fs->varyings_loc[i];
1719 unsigned src_offset;
1720 signed vs_idx = -1;
1721
1722 /* Link up */
1723 for (unsigned j = 0; j < vs->varying_count; ++j) {
1724 if (vs->varyings_loc[j] == loc) {
1725 vs_idx = j;
1726 break;
1727 }
1728 }
1729
1730 /* Either assign or reuse */
1731 if (vs_idx >= 0)
1732 src_offset = vs->varyings[vs_idx].src_offset;
1733 else
1734 src_offset = 16 * (num_gen_varyings++);
1735
1736 fs->varyings[i].src_offset = src_offset;
1737
1738 if (has_point_coord(fs->point_sprite_mask, loc))
1739 reads_point_coord = true;
1740 }
1741
1742 memcpy(trans.cpu, vs->varyings, vs_size);
1743 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1744
1745 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1746
1747 /* Figure out how many streamout buffers could be bound */
1748 unsigned so_count = ctx->streamout.num_targets;
1749 for (unsigned i = 0; i < vs->varying_count; i++) {
1750 gl_varying_slot loc = vs->varyings_loc[i];
1751
1752 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1753 if (!captured) continue;
1754
1755 struct pipe_stream_output *o = pan_get_so(so, loc);
1756 so_count = MAX2(so_count, o->output_buffer + 1);
1757 }
1758
1759 signed idx = so_count;
1760 signed general = idx++;
1761 signed gl_Position = idx++;
1762 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1763 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1764 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1765 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1766
1767 /* Emit the stream out buffers */
1768
1769 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1770 ctx->vertex_count);
1771
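        /* Each buffer gets out_count vertices' worth of space, whether it has
         * a bound target (real streamout) or not (dummy buffer) */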
1772 for (unsigned i = 0; i < so_count; ++i) {
1773 if (i < ctx->streamout.num_targets) {
1774 panfrost_emit_streamout(batch, &varyings[i],
1775 so->stride[i],
1776 ctx->streamout.offsets[i],
1777 out_count,
1778 ctx->streamout.targets[i]);
1779 } else {
1780 /* Emit a dummy buffer */
1781 panfrost_emit_varyings(batch, &varyings[i],
1782 so->stride[i] * 4,
1783 out_count);
1784
1785 /* Clear the attribute type */
1786 varyings[i].elements &= ~0xF;
1787 }
1788 }
1789
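        /* General varyings are packed one 16-byte (vec4-sized) slot per
         * varying per vertex, matching the src_offsets assigned above */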
1790 panfrost_emit_varyings(batch, &varyings[general],
1791 num_gen_varyings * 16,
1792 vertex_count);
1793
1794 mali_ptr varyings_p;
1795
1796 /* fp32 vec4 gl_Position */
1797 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1798 sizeof(float) * 4, vertex_count);
1799 tiler_postfix->position_varying = varyings_p;
1800
1801
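        /* gl_PointSize is emitted as 2 bytes per vertex (presumably a
         * half-float point size) */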
1802 if (panfrost_writes_point_size(ctx)) {
1803 varyings_p = panfrost_emit_varyings(batch,
1804 &varyings[gl_PointSize],
1805 2, vertex_count);
1806 primitive_size->pointer = varyings_p;
1807 }
1808
1809 if (reads_point_coord)
1810 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1811
1812 if (fs->reads_face)
1813 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1814
1815 if (fs->reads_frag_coord)
1816 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1817
1818 struct panfrost_device *device = pan_device(ctx->base.screen);
1819 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord));
1820
1821         /* Let's go ahead and link varying meta to the buffer in question, now
1822          * that that information is available. VARYING_SLOT_POS is mapped to
1823          * gl_FragCoord for fragment shaders but gl_Position for vertex
1824          * shaders */
1825
1826 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1827 gl_PointSize, gl_PointCoord,
1828 gl_FrontFacing);
1829
1830 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1831 gl_FragCoord, gl_PointSize,
1832 gl_PointCoord, gl_FrontFacing);
1833
1834         /* Replace the records for captured (streamout) varyings: repoint them
              * at their transform feedback buffer with a matching format and swizzle */
1835
1836 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1837 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1838
1839 for (unsigned i = 0; i < vs->varying_count; i++) {
1840 gl_varying_slot loc = vs->varyings_loc[i];
1841
1842 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1843 if (!captured)
1844 continue;
1845
1846 struct pipe_stream_output *o = pan_get_so(so, loc);
1847 ovs[i].index = o->output_buffer;
1848
1849 /* Set the type appropriately. TODO: Integer varyings XXX */
1850 assert(o->stream == 0);
1851 ovs[i].format = pan_xfb_format(o->num_components);
1852
1853 if (device->quirks & HAS_SWIZZLES)
1854 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1855 else
1856 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1857
1858 /* Link to the fragment */
1859 signed fs_idx = -1;
1860
1861 /* Link up */
1862 for (unsigned j = 0; j < fs->varying_count; ++j) {
1863 if (fs->varyings_loc[j] == loc) {
1864 fs_idx = j;
1865 break;
1866 }
1867 }
1868
1869 if (fs_idx >= 0) {
1870 ofs[fs_idx].index = ovs[i].index;
1871 ofs[fs_idx].format = ovs[i].format;
1872 ofs[fs_idx].swizzle = ovs[i].swizzle;
1873 }
1874 }
1875
1876 /* Replace point sprite */
1877 for (unsigned i = 0; i < fs->varying_count; i++) {
1878                 /* If we have a point sprite replacement, handle it here. We
1879                  * have to translate the location first. TODO: flip Y in the
1880                  * shader instead; we're already keying, just a time crunch */
1881
1882 if (has_point_coord(fs->point_sprite_mask,
1883 fs->varyings_loc[i])) {
1884 ofs[i].index = gl_PointCoord;
1885
1886 /* Swizzle out the z/w to 0/1 */
1887 ofs[i].format = MALI_RG16F;
1888 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1889 }
1890 }
1891
1892         /* Fix up unaligned addresses: the low bits of a record double as its
              * mode, so round the base address down to 64 bytes and push the
              * remainder into the src_offset of every varying record that
              * references the buffer */
1893 for (unsigned i = 0; i < so_count; ++i) {
1894 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1895 continue;
1896
1897 unsigned align = (varyings[i].elements & 63);
1898
1899 /* While we're at it, the SO buffers are linear */
1900
1901 if (!align) {
1902 varyings[i].elements |= MALI_ATTR_LINEAR;
1903 continue;
1904 }
1905
1906 /* We need to adjust alignment */
1907 varyings[i].elements &= ~63;
1908 varyings[i].elements |= MALI_ATTR_LINEAR;
1909 varyings[i].size += align;
1910
1911 for (unsigned v = 0; v < vs->varying_count; ++v) {
1912 if (ovs[v].index != i)
1913 continue;
1914
1915 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1916 }
1917
1918 for (unsigned f = 0; f < fs->varying_count; ++f) {
1919 if (ofs[f].index != i)
1920 continue;
1921
1922 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1923 }
1924 }
1925
1926 varyings_p = panfrost_upload_transient(batch, varyings,
1927 idx * sizeof(*varyings));
1928 vertex_postfix->varyings = varyings_p;
1929 tiler_postfix->varyings = varyings_p;
1930
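        /* Both jobs see the same buffer records; the meta pointers select the
         * vertex or fragment half of the descriptor written above */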
1931 vertex_postfix->varying_meta = trans.gpu;
1932 tiler_postfix->varying_meta = trans.gpu + vs_size;
1933 }
1934
1935 void
1936 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1937 struct mali_vertex_tiler_prefix *vertex_prefix,
1938 struct mali_vertex_tiler_postfix *vertex_postfix,
1939 struct mali_vertex_tiler_prefix *tiler_prefix,
1940 struct mali_vertex_tiler_postfix *tiler_postfix,
1941 union midgard_primitive_size *primitive_size)
1942 {
1943 struct panfrost_context *ctx = batch->ctx;
1944 struct panfrost_device *device = pan_device(ctx->base.screen);
1945 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1946 struct bifrost_payload_vertex bifrost_vertex = {0,};
1947 struct bifrost_payload_tiler bifrost_tiler = {0,};
1948 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1949 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1950 void *vp, *tp;
1951 size_t vp_size, tp_size;
1952
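        /* Build the vertex and tiler payloads in whichever layout this GPU
         * family expects (Bifrost or Midgard), then submit them below */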
1953 if (device->quirks & IS_BIFROST) {
1954 bifrost_vertex.prefix = *vertex_prefix;
1955 bifrost_vertex.postfix = *vertex_postfix;
1956 vp = &bifrost_vertex;
1957 vp_size = sizeof(bifrost_vertex);
1958
1959 bifrost_tiler.prefix = *tiler_prefix;
1960 bifrost_tiler.tiler.primitive_size = *primitive_size;
1961 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1962 bifrost_tiler.postfix = *tiler_postfix;
1963 tp = &bifrost_tiler;
1964 tp_size = sizeof(bifrost_tiler);
1965 } else {
1966 midgard_vertex.prefix = *vertex_prefix;
1967 midgard_vertex.postfix = *vertex_postfix;
1968 vp = &midgard_vertex;
1969 vp_size = sizeof(midgard_vertex);
1970
1971 midgard_tiler.prefix = *tiler_prefix;
1972 midgard_tiler.postfix = *tiler_postfix;
1973 midgard_tiler.primitive_size = *primitive_size;
1974 tp = &midgard_tiler;
1975 tp_size = sizeof(midgard_tiler);
1976 }
1977
1978 if (wallpapering) {
1979 /* Inject in reverse order, with "predicted" job indices.
1980 * THIS IS A HACK XXX */
1981 panfrost_new_job(batch, JOB_TYPE_TILER, false,
1982 batch->job_index + 2, tp, tp_size, true);
1983 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1984 vp, vp_size, true);
1985 return;
1986 }
1987
1988         /* If rasterizer discard is enabled, only submit the vertex job */
1989
1990 bool rasterizer_discard = ctx->rasterizer &&
1991 ctx->rasterizer->base.rasterizer_discard;
1992
1993 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1994 vp, vp_size, false);
1995
1996 if (rasterizer_discard)
1997 return;
1998
1999 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
2000 false);
2001 }
2002
2003 /* TODO: stop hardcoding this */
2004 mali_ptr
2005 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2006 {
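        /* 48 (x, y) pairs of 16-bit sample positions, 96 uint16_t values in
         * total; (128, 128) presumably encodes the pixel centre. Still
         * hardcoded, per the TODO above. */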
2007 uint16_t locations[] = {
2008 128, 128,
2009 0, 256,
2010 0, 256,
2011 0, 256,
2012 0, 256,
2013 0, 256,
2014 0, 256,
2015 0, 256,
2016 0, 256,
2017 0, 256,
2018 0, 256,
2019 0, 256,
2020 0, 256,
2021 0, 256,
2022 0, 256,
2023 0, 256,
2024 0, 256,
2025 0, 256,
2026 0, 256,
2027 0, 256,
2028 0, 256,
2029 0, 256,
2030 0, 256,
2031 0, 256,
2032 0, 256,
2033 0, 256,
2034 0, 256,
2035 0, 256,
2036 0, 256,
2037 0, 256,
2038 0, 256,
2039 0, 256,
2040 128, 128,
2041 0, 0,
2042 0, 0,
2043 0, 0,
2044 0, 0,
2045 0, 0,
2046 0, 0,
2047 0, 0,
2048 0, 0,
2049 0, 0,
2050 0, 0,
2051 0, 0,
2052 0, 0,
2053 0, 0,
2054 0, 0,
2055 0, 0,
2056 };
2057
2058 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2059 }