panfrost: Add Bifrost texture trampoline BO to batch
[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_allocate.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 static void
55 panfrost_vt_emit_shared_memory(struct panfrost_context *ctx,
56 struct mali_vertex_tiler_postfix *postfix)
57 {
58 struct panfrost_device *dev = pan_device(ctx->base.screen);
59 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
60
61 unsigned shift = panfrost_get_stack_shift(batch->stack_size);
62 struct mali_shared_memory shared = {
63 .stack_shift = shift,
64 .scratchpad = panfrost_batch_get_scratchpad(batch, shift, dev->thread_tls_alloc, dev->core_count)->gpu,
65 .shared_workgroup_count = ~0,
66 };
67 postfix->shared_memory = panfrost_upload_transient(batch, &shared, sizeof(shared));
68 }
69
70 static void
71 panfrost_vt_attach_framebuffer(struct panfrost_context *ctx,
72 struct mali_vertex_tiler_postfix *postfix)
73 {
74 struct panfrost_device *dev = pan_device(ctx->base.screen);
75 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
76
77 /* If we haven't already, reserve space for the framebuffer */
78
79 if (!batch->framebuffer.gpu) {
80 unsigned size = (dev->quirks & MIDGARD_SFBD) ?
81 sizeof(struct mali_single_framebuffer) :
82 sizeof(struct mali_framebuffer);
83
84 batch->framebuffer = panfrost_allocate_transient(batch, size);
85
86 /* Tag the pointer */
87 if (!(dev->quirks & MIDGARD_SFBD))
88 batch->framebuffer.gpu |= MALI_MFBD;
89 }
90
91 postfix->shared_memory = batch->framebuffer.gpu;
92 }
93
94 static void
95 panfrost_vt_update_rasterizer(struct panfrost_context *ctx,
96 struct mali_vertex_tiler_prefix *prefix,
97 struct mali_vertex_tiler_postfix *postfix)
98 {
99 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
100
101 postfix->gl_enables |= 0x7;
102 SET_BIT(postfix->gl_enables, MALI_FRONT_CCW_TOP,
103 rasterizer && rasterizer->base.front_ccw);
104 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_FRONT,
105 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_FRONT));
106 SET_BIT(postfix->gl_enables, MALI_CULL_FACE_BACK,
107 rasterizer && (rasterizer->base.cull_face & PIPE_FACE_BACK));
108 SET_BIT(prefix->unknown_draw, MALI_DRAW_FLATSHADE_FIRST,
109 rasterizer && rasterizer->base.flatshade_first);
110 }
111
112 void
113 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
114 struct mali_vertex_tiler_prefix *prefix,
115 union midgard_primitive_size *primitive_size)
116 {
117 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
118
119 if (!panfrost_writes_point_size(ctx)) {
120 bool points = prefix->draw_mode == MALI_POINTS;
121 float val = 0.0f;
122
123 if (rasterizer)
124 val = points ?
125 rasterizer->base.point_size :
126 rasterizer->base.line_width;
127
128 primitive_size->constant = val;
129 }
130 }
131
132 static void
133 panfrost_vt_update_occlusion_query(struct panfrost_context *ctx,
134 struct mali_vertex_tiler_postfix *postfix)
135 {
136 SET_BIT(postfix->gl_enables, MALI_OCCLUSION_QUERY, ctx->occlusion_query);
137 if (ctx->occlusion_query)
138 postfix->occlusion_counter = ctx->occlusion_query->bo->gpu;
139 else
140 postfix->occlusion_counter = 0;
141 }
142
143 void
144 panfrost_vt_init(struct panfrost_context *ctx,
145 enum pipe_shader_type stage,
146 struct mali_vertex_tiler_prefix *prefix,
147 struct mali_vertex_tiler_postfix *postfix)
148 {
149 struct panfrost_device *device = pan_device(ctx->base.screen);
150
151 if (!ctx->shader[stage])
152 return;
153
154 memset(prefix, 0, sizeof(*prefix));
155 memset(postfix, 0, sizeof(*postfix));
156
157 if (device->quirks & IS_BIFROST) {
158 postfix->gl_enables = 0x2;
159 panfrost_vt_emit_shared_memory(ctx, postfix);
160 } else {
161 postfix->gl_enables = 0x6;
162 panfrost_vt_attach_framebuffer(ctx, postfix);
163 }
164
165 if (stage == PIPE_SHADER_FRAGMENT) {
166 panfrost_vt_update_occlusion_query(ctx, postfix);
167 panfrost_vt_update_rasterizer(ctx, prefix, postfix);
168 }
169 }
170
171 static unsigned
172 panfrost_translate_index_size(unsigned size)
173 {
174 switch (size) {
175 case 1:
176 return MALI_DRAW_INDEXED_UINT8;
177
178 case 2:
179 return MALI_DRAW_INDEXED_UINT16;
180
181 case 4:
182 return MALI_DRAW_INDEXED_UINT32;
183
184 default:
185 unreachable("Invalid index size");
186 }
187 }
188
189 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
190 * good for the duration of the draw (transient), though it could last longer. Also get
191 * the bounds on the index buffer for the range accessed by the draw. We do
192 * these operations together because there are natural optimizations which
193 * require them to be together. */
194
195 static mali_ptr
196 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
197 const struct pipe_draw_info *info,
198 unsigned *min_index, unsigned *max_index)
199 {
200 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
201 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
202 off_t offset = info->start * info->index_size;
203 bool needs_indices = true;
204 mali_ptr out = 0;
205
206 if (info->max_index != ~0u) {
207 *min_index = info->min_index;
208 *max_index = info->max_index;
209 needs_indices = false;
210 }
211
212 if (!info->has_user_indices) {
213 /* Only resources can be directly mapped */
214 panfrost_batch_add_bo(batch, rsrc->bo,
215 PAN_BO_ACCESS_SHARED |
216 PAN_BO_ACCESS_READ |
217 PAN_BO_ACCESS_VERTEX_TILER);
218 out = rsrc->bo->gpu + offset;
219
220 /* Check the cache */
221 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
222 info->start,
223 info->count,
224 min_index,
225 max_index);
226 } else {
227 /* Otherwise, we need to upload to transient memory */
228 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
229 out = panfrost_upload_transient(batch, ibuf8 + offset,
230 info->count *
231 info->index_size);
232 }
233
234 if (needs_indices) {
235 /* Fallback */
236 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
237
238 if (!info->has_user_indices)
239 panfrost_minmax_cache_add(rsrc->index_cache,
240 info->start, info->count,
241 *min_index, *max_index);
242 }
243
244 return out;
245 }
246
247 void
248 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
249 const struct pipe_draw_info *info,
250 enum mali_draw_mode draw_mode,
251 struct mali_vertex_tiler_postfix *vertex_postfix,
252 struct mali_vertex_tiler_prefix *tiler_prefix,
253 struct mali_vertex_tiler_postfix *tiler_postfix,
254 unsigned *vertex_count,
255 unsigned *padded_count)
256 {
257 tiler_prefix->draw_mode = draw_mode;
258
259 unsigned draw_flags = 0;
260
261 if (panfrost_writes_point_size(ctx))
262 draw_flags |= MALI_DRAW_VARYING_SIZE;
263
264 if (info->primitive_restart)
265 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
266
267 /* These don't make much sense */
268
269 draw_flags |= 0x3000;
270
271 if (info->index_size) {
272 unsigned min_index = 0, max_index = 0;
273
274 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
275 info,
276 &min_index,
277 &max_index);
278
279 /* Use the corresponding values */
280 *vertex_count = max_index - min_index + 1;
281 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
282 tiler_prefix->offset_bias_correction = -min_index;
283 tiler_prefix->index_count = MALI_POSITIVE(info->count);
284 draw_flags |= panfrost_translate_index_size(info->index_size);
285 } else {
286 tiler_prefix->indices = 0;
287 *vertex_count = ctx->vertex_count;
288 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
289 tiler_prefix->offset_bias_correction = 0;
290 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
291 }
292
293 tiler_prefix->unknown_draw = draw_flags;
294
295 /* Encode the padded vertex count */
296
297 if (info->instance_count > 1) {
298 *padded_count = panfrost_padded_vertex_count(*vertex_count);
299
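/* A hedged note: the padded count returned above is of the form (2k + 1) << shift,
 * so the shift/odd split computed below presumably lets the hardware divide by the
 * padded count cheaply when deriving instance IDs. */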
300 unsigned shift = __builtin_ctz(ctx->padded_count);
301 unsigned k = ctx->padded_count >> (shift + 1);
302
303 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
304 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
305 } else {
306 *padded_count = *vertex_count;
307
308 /* Reset instancing state */
309 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
310 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
311 }
312 }
313
314 static void
315 panfrost_shader_meta_init(struct panfrost_context *ctx,
316 enum pipe_shader_type st,
317 struct mali_shader_meta *meta)
318 {
319 const struct panfrost_device *dev = pan_device(ctx->base.screen);
320 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
321
322 memset(meta, 0, sizeof(*meta));
323 meta->shader = (ss->bo ? ss->bo->gpu : 0) | ss->first_tag;
324 meta->attribute_count = ss->attribute_count;
325 meta->varying_count = ss->varying_count;
326 meta->texture_count = ctx->sampler_view_count[st];
327 meta->sampler_count = ctx->sampler_count[st];
328
329 if (dev->quirks & IS_BIFROST) {
330 if (st == PIPE_SHADER_VERTEX)
331 meta->bifrost1.unk1 = 0x800000;
332 else {
333 /* First clause ATEST |= 0x4000000.
334 * Less than 32 regs |= 0x200 */
335 meta->bifrost1.unk1 = 0x958020;
336 }
337
338 meta->bifrost1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
339 if (st == PIPE_SHADER_VERTEX)
340 meta->bifrost2.preload_regs = 0xC0;
341 else
342 meta->bifrost2.preload_regs = 0x1;
343 meta->bifrost2.uniform_count = MIN2(ss->uniform_count,
344 ss->uniform_cutoff);
345 } else {
346 meta->midgard1.uniform_count = MIN2(ss->uniform_count,
347 ss->uniform_cutoff);
348 meta->midgard1.work_count = ss->work_reg_count;
349 meta->midgard1.flags_hi = 0x8; /* XXX */
350 meta->midgard1.flags_lo = 0x220;
351 meta->midgard1.uniform_buffer_count = panfrost_ubo_count(ctx, st);
352 }
353 }
354
355 static unsigned
356 panfrost_translate_compare_func(enum pipe_compare_func in)
357 {
358 switch (in) {
359 case PIPE_FUNC_NEVER:
360 return MALI_FUNC_NEVER;
361
362 case PIPE_FUNC_LESS:
363 return MALI_FUNC_LESS;
364
365 case PIPE_FUNC_EQUAL:
366 return MALI_FUNC_EQUAL;
367
368 case PIPE_FUNC_LEQUAL:
369 return MALI_FUNC_LEQUAL;
370
371 case PIPE_FUNC_GREATER:
372 return MALI_FUNC_GREATER;
373
374 case PIPE_FUNC_NOTEQUAL:
375 return MALI_FUNC_NOTEQUAL;
376
377 case PIPE_FUNC_GEQUAL:
378 return MALI_FUNC_GEQUAL;
379
380 case PIPE_FUNC_ALWAYS:
381 return MALI_FUNC_ALWAYS;
382
383 default:
384 unreachable("Invalid func");
385 }
386 }
387
388 static unsigned
389 panfrost_translate_stencil_op(enum pipe_stencil_op in)
390 {
391 switch (in) {
392 case PIPE_STENCIL_OP_KEEP:
393 return MALI_STENCIL_KEEP;
394
395 case PIPE_STENCIL_OP_ZERO:
396 return MALI_STENCIL_ZERO;
397
398 case PIPE_STENCIL_OP_REPLACE:
399 return MALI_STENCIL_REPLACE;
400
401 case PIPE_STENCIL_OP_INCR:
402 return MALI_STENCIL_INCR;
403
404 case PIPE_STENCIL_OP_DECR:
405 return MALI_STENCIL_DECR;
406
407 case PIPE_STENCIL_OP_INCR_WRAP:
408 return MALI_STENCIL_INCR_WRAP;
409
410 case PIPE_STENCIL_OP_DECR_WRAP:
411 return MALI_STENCIL_DECR_WRAP;
412
413 case PIPE_STENCIL_OP_INVERT:
414 return MALI_STENCIL_INVERT;
415
416 default:
417 unreachable("Invalid stencil op");
418 }
419 }
420
421 static unsigned
422 translate_tex_wrap(enum pipe_tex_wrap w)
423 {
424 switch (w) {
425 case PIPE_TEX_WRAP_REPEAT:
426 return MALI_WRAP_REPEAT;
427
428 case PIPE_TEX_WRAP_CLAMP:
429 return MALI_WRAP_CLAMP;
430
431 case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
432 return MALI_WRAP_CLAMP_TO_EDGE;
433
434 case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
435 return MALI_WRAP_CLAMP_TO_BORDER;
436
437 case PIPE_TEX_WRAP_MIRROR_REPEAT:
438 return MALI_WRAP_MIRRORED_REPEAT;
439
440 case PIPE_TEX_WRAP_MIRROR_CLAMP:
441 return MALI_WRAP_MIRRORED_CLAMP;
442
443 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
444 return MALI_WRAP_MIRRORED_CLAMP_TO_EDGE;
445
446 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
447 return MALI_WRAP_MIRRORED_CLAMP_TO_BORDER;
448
449 default:
450 unreachable("Invalid wrap");
451 }
452 }
453
454 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
455 struct mali_sampler_descriptor *hw)
456 {
457 unsigned func = panfrost_translate_compare_func(cso->compare_func);
458 bool min_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
459 bool mag_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
460 bool mip_linear = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR;
461 unsigned min_filter = min_nearest ? MALI_SAMP_MIN_NEAREST : 0;
462 unsigned mag_filter = mag_nearest ? MALI_SAMP_MAG_NEAREST : 0;
463 unsigned mip_filter = mip_linear ?
464 (MALI_SAMP_MIP_LINEAR_1 | MALI_SAMP_MIP_LINEAR_2) : 0;
465 unsigned normalized = cso->normalized_coords ? MALI_SAMP_NORM_COORDS : 0;
466
467 *hw = (struct mali_sampler_descriptor) {
468 .filter_mode = min_filter | mag_filter | mip_filter |
469 normalized,
470 .wrap_s = translate_tex_wrap(cso->wrap_s),
471 .wrap_t = translate_tex_wrap(cso->wrap_t),
472 .wrap_r = translate_tex_wrap(cso->wrap_r),
473 .compare_func = panfrost_flip_compare_func(func),
474 .border_color = {
475 cso->border_color.f[0],
476 cso->border_color.f[1],
477 cso->border_color.f[2],
478 cso->border_color.f[3]
479 },
480 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
481 .max_lod = FIXED_16(cso->max_lod, false),
482 .lod_bias = FIXED_16(cso->lod_bias, true), /* can be negative */
483 .seamless_cube_map = cso->seamless_cube_map,
484 };
485
486 /* If necessary, we disable mipmapping in the sampler descriptor by
487 * clamping the LOD as tight as possible (from 0 to epsilon,
488 * essentially -- remember these are fixed point numbers, so
489 * epsilon=1/256) */
490
491 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
492 hw->max_lod = hw->min_lod + 1;
493 }
494
495 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
496 struct bifrost_sampler_descriptor *hw)
497 {
498 *hw = (struct bifrost_sampler_descriptor) {
499 .unk1 = 0x1,
500 .wrap_s = translate_tex_wrap(cso->wrap_s),
501 .wrap_t = translate_tex_wrap(cso->wrap_t),
502 .wrap_r = translate_tex_wrap(cso->wrap_r),
503 .unk8 = 0x8,
504 .min_filter = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST,
505 .norm_coords = cso->normalized_coords,
506 .mip_filter = cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR,
507 .mag_filter = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR,
508 .min_lod = FIXED_16(cso->min_lod, false), /* clamp at 0 */
509 .max_lod = FIXED_16(cso->max_lod, false),
510 };
511
512 /* If necessary, we disable mipmapping in the sampler descriptor by
513 * clamping the LOD as tight as possible (from 0 to epsilon,
514 * essentially -- remember these are fixed point numbers, so
515 * epsilon=1/256) */
516
517 if (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
518 hw->max_lod = hw->min_lod + 1;
519 }
520
521 static void
522 panfrost_make_stencil_state(const struct pipe_stencil_state *in,
523 struct mali_stencil_test *out)
524 {
525 out->ref = 0; /* Gallium gets it from elsewhere */
526
527 out->mask = in->valuemask;
528 out->func = panfrost_translate_compare_func(in->func);
529 out->sfail = panfrost_translate_stencil_op(in->fail_op);
530 out->dpfail = panfrost_translate_stencil_op(in->zfail_op);
531 out->dppass = panfrost_translate_stencil_op(in->zpass_op);
532 }
533
534 static void
535 panfrost_frag_meta_rasterizer_update(struct panfrost_context *ctx,
536 struct mali_shader_meta *fragmeta)
537 {
538 if (!ctx->rasterizer) {
539 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, true);
540 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, false);
541 fragmeta->depth_units = 0.0f;
542 fragmeta->depth_factor = 0.0f;
543 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A, false);
544 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B, false);
545 return;
546 }
547
548 bool msaa = ctx->rasterizer->base.multisample;
549
550 /* TODO: Sample size */
551 SET_BIT(fragmeta->unknown2_3, MALI_HAS_MSAA, msaa);
552 SET_BIT(fragmeta->unknown2_4, MALI_NO_MSAA, !msaa);
553 fragmeta->depth_units = ctx->rasterizer->base.offset_units * 2.0f;
554 fragmeta->depth_factor = ctx->rasterizer->base.offset_scale;
555
556 /* XXX: Which bit is which? Does this maybe allow offsetting not-tri? */
557
558 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_A,
559 ctx->rasterizer->base.offset_tri);
560 SET_BIT(fragmeta->unknown2_4, MALI_DEPTH_RANGE_B,
561 ctx->rasterizer->base.offset_tri);
562 }
563
564 static void
565 panfrost_frag_meta_zsa_update(struct panfrost_context *ctx,
566 struct mali_shader_meta *fragmeta)
567 {
568 const struct pipe_depth_stencil_alpha_state *zsa = ctx->depth_stencil;
569 int zfunc = PIPE_FUNC_ALWAYS;
570
571 if (!zsa) {
572 struct pipe_stencil_state default_stencil = {
573 .enabled = 0,
574 .func = PIPE_FUNC_ALWAYS,
575 .fail_op = MALI_STENCIL_KEEP,
576 .zfail_op = MALI_STENCIL_KEEP,
577 .zpass_op = MALI_STENCIL_KEEP,
578 .writemask = 0xFF,
579 .valuemask = 0xFF
580 };
581
582 panfrost_make_stencil_state(&default_stencil,
583 &fragmeta->stencil_front);
584 fragmeta->stencil_mask_front = default_stencil.writemask;
585 fragmeta->stencil_back = fragmeta->stencil_front;
586 fragmeta->stencil_mask_back = default_stencil.writemask;
587 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST, false);
588 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK, false);
589 } else {
590 SET_BIT(fragmeta->unknown2_4, MALI_STENCIL_TEST,
591 zsa->stencil[0].enabled);
592 panfrost_make_stencil_state(&zsa->stencil[0],
593 &fragmeta->stencil_front);
594 fragmeta->stencil_mask_front = zsa->stencil[0].writemask;
595 fragmeta->stencil_front.ref = ctx->stencil_ref.ref_value[0];
596
597 /* If back-stencil is not enabled, use the front values */
598
599 if (zsa->stencil[1].enabled) {
600 panfrost_make_stencil_state(&zsa->stencil[1],
601 &fragmeta->stencil_back);
602 fragmeta->stencil_mask_back = zsa->stencil[1].writemask;
603 fragmeta->stencil_back.ref = ctx->stencil_ref.ref_value[1];
604 } else {
605 fragmeta->stencil_back = fragmeta->stencil_front;
606 fragmeta->stencil_mask_back = fragmeta->stencil_mask_front;
607 fragmeta->stencil_back.ref = fragmeta->stencil_front.ref;
608 }
609
610 if (zsa->depth.enabled)
611 zfunc = zsa->depth.func;
612
613 /* Depth state (TODO: Refactor) */
614
615 SET_BIT(fragmeta->unknown2_3, MALI_DEPTH_WRITEMASK,
616 zsa->depth.writemask);
617 }
618
619 fragmeta->unknown2_3 &= ~MALI_DEPTH_FUNC_MASK;
620 fragmeta->unknown2_3 |= MALI_DEPTH_FUNC(panfrost_translate_compare_func(zfunc));
621 }
622
623 static void
624 panfrost_frag_meta_blend_update(struct panfrost_context *ctx,
625 struct mali_shader_meta *fragmeta,
626 void *rts)
627 {
628 const struct panfrost_device *dev = pan_device(ctx->base.screen);
629
630 SET_BIT(fragmeta->unknown2_4, MALI_NO_DITHER,
631 (dev->quirks & MIDGARD_SFBD) && ctx->blend &&
632 !ctx->blend->base.dither);
633
634 /* Get blending setup */
635 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
636
637 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
638 unsigned shader_offset = 0;
639 struct panfrost_bo *shader_bo = NULL;
640
641 for (unsigned c = 0; c < rt_count; ++c)
642 blend[c] = panfrost_get_blend_for_context(ctx, c, &shader_bo,
643 &shader_offset);
644
645 /* If there is a blend shader, work registers are shared. XXX: opt */
646
647 for (unsigned c = 0; c < rt_count; ++c) {
648 if (blend[c].is_shader)
649 fragmeta->midgard1.work_count = 16;
650 }
651
652 /* Even on MFBD, the shader descriptor gets blend shaders. It's *also*
653 * copied to the blend_meta appended (by convention), but this is the
654 * field actually read by the hardware. (Or maybe both are read...?).
655 * Specify the last RTi with a blend shader. */
656
657 fragmeta->blend.shader = 0;
658
659 for (signed rt = (rt_count - 1); rt >= 0; --rt) {
660 if (!blend[rt].is_shader)
661 continue;
662
663 fragmeta->blend.shader = blend[rt].shader.gpu |
664 blend[rt].shader.first_tag;
665 break;
666 }
667
668 if (dev->quirks & MIDGARD_SFBD) {
669 /* On platforms with only a single render target (SFBD), the blend
670 * information lives inside the shader meta itself. We additionally
671 * need to signal CAN_DISCARD for nontrivial blend modes (so
672 * we're able to read back the destination buffer) */
673
674 SET_BIT(fragmeta->unknown2_3, MALI_HAS_BLEND_SHADER,
675 blend[0].is_shader);
676
677 if (!blend[0].is_shader) {
678 fragmeta->blend.equation = *blend[0].equation.equation;
679 fragmeta->blend.constant = blend[0].equation.constant;
680 }
681
682 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD,
683 !blend[0].no_blending);
684 return;
685 }
686
687 /* Additional blend descriptor tacked on for jobs using MFBD */
688
689 for (unsigned i = 0; i < rt_count; ++i) {
690 if (dev->quirks & IS_BIFROST) {
691 struct bifrost_blend_rt *brts = rts;
692 struct panfrost_shader_state *fs;
693 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
694
695 brts[i].flags = 0x200;
696 if (blend[i].is_shader) {
697 /* The blend shader's address needs to have
698 * the same top 32 bits as the fragment shader.
699 * TODO: Ensure that's always the case.
700 */
701 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
702 (fs->bo->gpu & (0xffffffffull << 32)));
703 brts[i].shader = blend[i].shader.gpu;
704 brts[i].unk2 = 0x0;
705 } else {
706 enum pipe_format format = ctx->pipe_framebuffer.cbufs[i]->format;
707 const struct util_format_description *format_desc;
708 format_desc = util_format_description(format);
709
710 brts[i].equation = *blend[i].equation.equation;
711
712 /* TODO: this is a bit more complicated */
713 brts[i].constant = blend[i].equation.constant;
714
715 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
716 brts[i].unk2 = 0x19;
717
718 brts[i].shader_type = fs->blend_types[i];
719 }
720 } else {
721 struct midgard_blend_rt *mrts = rts;
722
723 mrts[i].flags = 0x200;
724
725 bool is_srgb = (ctx->pipe_framebuffer.nr_cbufs > i) &&
726 (ctx->pipe_framebuffer.cbufs[i]) &&
727 util_format_is_srgb(ctx->pipe_framebuffer.cbufs[i]->format);
728
729 SET_BIT(mrts[i].flags, MALI_BLEND_MRT_SHADER, blend[i].is_shader);
730 SET_BIT(mrts[i].flags, MALI_BLEND_LOAD_TIB, !blend[i].no_blending);
731 SET_BIT(mrts[i].flags, MALI_BLEND_SRGB, is_srgb);
732 SET_BIT(mrts[i].flags, MALI_BLEND_NO_DITHER, !ctx->blend->base.dither);
733
734 if (blend[i].is_shader) {
735 mrts[i].blend.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
736 } else {
737 mrts[i].blend.equation = *blend[i].equation.equation;
738 mrts[i].blend.constant = blend[i].equation.constant;
739 }
740 }
741 }
742 }
743
744 static void
745 panfrost_frag_shader_meta_init(struct panfrost_context *ctx,
746 struct mali_shader_meta *fragmeta,
747 void *rts)
748 {
749 const struct panfrost_device *dev = pan_device(ctx->base.screen);
750 struct panfrost_shader_state *fs;
751
752 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
753
754 fragmeta->alpha_coverage = ~MALI_ALPHA_COVERAGE(0.000000);
755 fragmeta->unknown2_3 = MALI_DEPTH_FUNC(MALI_FUNC_ALWAYS) | 0x3010;
756 fragmeta->unknown2_4 = 0x4e0;
757
758 /* unknown2_4 has the 0x10 bit set on T6XX and T720. We don't know why this
759 * is required (independent of 32-bit/64-bit descriptors), or why it's
760 * not used on later GPU revisions. Otherwise, all shader jobs fault on
761 * these earlier chips (perhaps this is a chicken bit of some kind).
762 * More investigation is needed. */
763
764 SET_BIT(fragmeta->unknown2_4, 0x10, dev->quirks & MIDGARD_SFBD);
765
766 /* Depending on whether it's legal to do so in the given shader, we try to
767 * enable early-z testing (or forward-pixel kill?) */
768
769 SET_BIT(fragmeta->midgard1.flags_lo, MALI_EARLY_Z,
770 !fs->can_discard && !fs->writes_depth);
771
772 /* Add the writes Z/S flags if needed. */
773 SET_BIT(fragmeta->midgard1.flags_lo, MALI_WRITES_Z, fs->writes_depth);
774 SET_BIT(fragmeta->midgard1.flags_hi, MALI_WRITES_S, fs->writes_stencil);
775
776 /* Any time texturing is used, derivatives are implicitly calculated,
777 * so we need to enable helper invocations */
778
779 SET_BIT(fragmeta->midgard1.flags_lo, MALI_HELPER_INVOCATIONS,
780 fs->helper_invocations);
781
782 /* CAN_DISCARD should be set if the fragment shader possibly contains a
783 * 'discard' instruction. It is likely related to optimizations
784 * around forward-pixel kill, as per "Mali Performance 3: Is
785 * EGL_BUFFER_PRESERVED a good thing?" by Peter Harris */
786
787 SET_BIT(fragmeta->unknown2_3, MALI_CAN_DISCARD, fs->can_discard);
788 SET_BIT(fragmeta->midgard1.flags_lo, 0x400, fs->can_discard);
789
790 panfrost_frag_meta_rasterizer_update(ctx, fragmeta);
791 panfrost_frag_meta_zsa_update(ctx, fragmeta);
792 panfrost_frag_meta_blend_update(ctx, fragmeta, rts);
793 }
794
795 void
796 panfrost_emit_shader_meta(struct panfrost_batch *batch,
797 enum pipe_shader_type st,
798 struct mali_vertex_tiler_postfix *postfix)
799 {
800 struct panfrost_context *ctx = batch->ctx;
801 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, st);
802
803 if (!ss) {
804 postfix->shader = 0;
805 return;
806 }
807
808 struct mali_shader_meta meta;
809
810 panfrost_shader_meta_init(ctx, st, &meta);
811
812 /* Add the shader BO to the batch. */
813 panfrost_batch_add_bo(batch, ss->bo,
814 PAN_BO_ACCESS_PRIVATE |
815 PAN_BO_ACCESS_READ |
816 panfrost_bo_access_for_stage(st));
817
818 mali_ptr shader_ptr;
819
820 if (st == PIPE_SHADER_FRAGMENT) {
821 struct panfrost_device *dev = pan_device(ctx->base.screen);
822 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
823 size_t desc_size = sizeof(meta);
824 void *rts = NULL;
825 struct panfrost_transfer xfer;
826 unsigned rt_size;
827
828 if (dev->quirks & MIDGARD_SFBD)
829 rt_size = 0;
830 else if (dev->quirks & IS_BIFROST)
831 rt_size = sizeof(struct bifrost_blend_rt);
832 else
833 rt_size = sizeof(struct midgard_blend_rt);
834
835 desc_size += rt_size * rt_count;
836
837 if (rt_size)
838 rts = rzalloc_size(ctx, rt_size * rt_count);
839
840 panfrost_frag_shader_meta_init(ctx, &meta, rts);
841
842 xfer = panfrost_allocate_transient(batch, desc_size);
843
844 memcpy(xfer.cpu, &meta, sizeof(meta));
845 memcpy(xfer.cpu + sizeof(meta), rts, rt_size * rt_count);
846
847 if (rt_size)
848 ralloc_free(rts);
849
850 shader_ptr = xfer.gpu;
851 } else {
852 shader_ptr = panfrost_upload_transient(batch, &meta,
853 sizeof(meta));
854 }
855
856 postfix->shader = shader_ptr;
857 }
858
859 static void
860 panfrost_mali_viewport_init(struct panfrost_context *ctx,
861 struct mali_viewport *mvp)
862 {
863 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
864
865 /* Clip bounds are encoded as floats. The viewport itself is encoded as
866 * (somewhat) asymmetric ints. */
867
868 const struct pipe_scissor_state *ss = &ctx->scissor;
869
870 memset(mvp, 0, sizeof(*mvp));
871
872 /* By default, do no viewport clipping, i.e. clip to (-inf, inf) in
873 * each direction. Clipping to the viewport in theory should work, but
874 * in practice causes issues when we're not explicitly trying to
875 * scissor */
876
877 *mvp = (struct mali_viewport) {
878 .clip_minx = -INFINITY,
879 .clip_miny = -INFINITY,
880 .clip_maxx = INFINITY,
881 .clip_maxy = INFINITY,
882 };
883
884 /* Always scissor to the viewport by default. */
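/* (The viewport transform is screen = translate + scale * ndc, so the
 * screen-space extents are translate +/- |scale|.) */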
885 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
886 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
887
888 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
889 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
890
891 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
892 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
893
894 /* Apply the scissor test */
895
896 unsigned minx, miny, maxx, maxy;
897
898 if (ss && ctx->rasterizer && ctx->rasterizer->base.scissor) {
899 minx = MAX2(ss->minx, vp_minx);
900 miny = MAX2(ss->miny, vp_miny);
901 maxx = MIN2(ss->maxx, vp_maxx);
902 maxy = MIN2(ss->maxy, vp_maxy);
903 } else {
904 minx = vp_minx;
905 miny = vp_miny;
906 maxx = vp_maxx;
907 maxy = vp_maxy;
908 }
909
910 /* Hardware needs the min/max to be strictly ordered, so flip if we
911 * need to. The viewport transformation in the vertex shader will
912 * handle the negatives if we don't */
913
914 if (miny > maxy) {
915 unsigned temp = miny;
916 miny = maxy;
917 maxy = temp;
918 }
919
920 if (minx > maxx) {
921 unsigned temp = minx;
922 minx = maxx;
923 maxx = temp;
924 }
925
926 if (minz > maxz) {
927 float temp = minz;
928 minz = maxz;
929 maxz = temp;
930 }
931
932 /* Clamp to the framebuffer size as a last check */
933
934 minx = MIN2(ctx->pipe_framebuffer.width, minx);
935 maxx = MIN2(ctx->pipe_framebuffer.width, maxx);
936
937 miny = MIN2(ctx->pipe_framebuffer.height, miny);
938 maxy = MIN2(ctx->pipe_framebuffer.height, maxy);
939
940 /* Upload */
941
942 mvp->viewport0[0] = minx;
943 mvp->viewport1[0] = MALI_POSITIVE(maxx);
944
945 mvp->viewport0[1] = miny;
946 mvp->viewport1[1] = MALI_POSITIVE(maxy);
947
948 mvp->clip_minz = minz;
949 mvp->clip_maxz = maxz;
950 }
951
952 void
953 panfrost_emit_viewport(struct panfrost_batch *batch,
954 struct mali_vertex_tiler_postfix *tiler_postfix)
955 {
956 struct panfrost_context *ctx = batch->ctx;
957 struct mali_viewport mvp;
958
959 panfrost_mali_viewport_init(batch->ctx, &mvp);
960
961 /* Update the job, unless we're doing wallpapering (whose lack of
962 * scissor we can ignore, since if we "miss" a tile of wallpaper, it'll
963 * just... be faster :) */
964
965 if (!ctx->wallpaper_batch)
966 panfrost_batch_union_scissor(batch, mvp.viewport0[0],
967 mvp.viewport0[1],
968 mvp.viewport1[0] + 1,
969 mvp.viewport1[1] + 1);
970
971 tiler_postfix->viewport = panfrost_upload_transient(batch, &mvp,
972 sizeof(mvp));
973 }
974
975 static mali_ptr
976 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
977 enum pipe_shader_type st,
978 struct panfrost_constant_buffer *buf,
979 unsigned index)
980 {
981 struct pipe_constant_buffer *cb = &buf->cb[index];
982 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
983
984 if (rsrc) {
985 panfrost_batch_add_bo(batch, rsrc->bo,
986 PAN_BO_ACCESS_SHARED |
987 PAN_BO_ACCESS_READ |
988 panfrost_bo_access_for_stage(st));
989
990 /* Alignment guaranteed by
991 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
992 return rsrc->bo->gpu + cb->buffer_offset;
993 } else if (cb->user_buffer) {
994 return panfrost_upload_transient(batch,
995 cb->user_buffer +
996 cb->buffer_offset,
997 cb->buffer_size);
998 } else {
999 unreachable("No constant buffer");
1000 }
1001 }
1002
1003 struct sysval_uniform {
1004 union {
1005 float f[4];
1006 int32_t i[4];
1007 uint32_t u[4];
1008 uint64_t du[2];
1009 };
1010 };
1011
1012 static void
1013 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
1014 struct sysval_uniform *uniform)
1015 {
1016 struct panfrost_context *ctx = batch->ctx;
1017 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1018
1019 uniform->f[0] = vp->scale[0];
1020 uniform->f[1] = vp->scale[1];
1021 uniform->f[2] = vp->scale[2];
1022 }
1023
1024 static void
1025 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
1026 struct sysval_uniform *uniform)
1027 {
1028 struct panfrost_context *ctx = batch->ctx;
1029 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
1030
1031 uniform->f[0] = vp->translate[0];
1032 uniform->f[1] = vp->translate[1];
1033 uniform->f[2] = vp->translate[2];
1034 }
1035
1036 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
1037 enum pipe_shader_type st,
1038 unsigned int sysvalid,
1039 struct sysval_uniform *uniform)
1040 {
1041 struct panfrost_context *ctx = batch->ctx;
1042 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
1043 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
1044 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
1045 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
1046
1047 assert(dim);
1048 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
1049
1050 if (dim > 1)
1051 uniform->i[1] = u_minify(tex->texture->height0,
1052 tex->u.tex.first_level);
1053
1054 if (dim > 2)
1055 uniform->i[2] = u_minify(tex->texture->depth0,
1056 tex->u.tex.first_level);
1057
1058 if (is_array)
1059 uniform->i[dim] = tex->texture->array_size;
1060 }
1061
1062 static void
1063 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
1064 enum pipe_shader_type st,
1065 unsigned ssbo_id,
1066 struct sysval_uniform *uniform)
1067 {
1068 struct panfrost_context *ctx = batch->ctx;
1069
1070 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
1071 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
1072
1073 /* Compute address */
1074 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
1075
1076 panfrost_batch_add_bo(batch, bo,
1077 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
1078 panfrost_bo_access_for_stage(st));
1079
1080 /* Upload address and size as sysval */
1081 uniform->du[0] = bo->gpu + sb.buffer_offset;
1082 uniform->u[2] = sb.buffer_size;
1083 }
1084
1085 static void
1086 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
1087 enum pipe_shader_type st,
1088 unsigned samp_idx,
1089 struct sysval_uniform *uniform)
1090 {
1091 struct panfrost_context *ctx = batch->ctx;
1092 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
1093
1094 uniform->f[0] = sampl->min_lod;
1095 uniform->f[1] = sampl->max_lod;
1096 uniform->f[2] = sampl->lod_bias;
1097
1098 /* Even without any errata, Midgard represents "no mipmapping" as
1099 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
1100 * panfrost_create_sampler_state which also explains our choice of
1101 * epsilon value (again to keep behaviour consistent) */
1102
1103 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
1104 uniform->f[1] = uniform->f[0] + (1.0/256.0);
1105 }
1106
1107 static void
1108 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
1109 struct sysval_uniform *uniform)
1110 {
1111 struct panfrost_context *ctx = batch->ctx;
1112
1113 uniform->u[0] = ctx->compute_grid->grid[0];
1114 uniform->u[1] = ctx->compute_grid->grid[1];
1115 uniform->u[2] = ctx->compute_grid->grid[2];
1116 }
1117
1118 static void
1119 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
1120 struct panfrost_shader_state *ss,
1121 enum pipe_shader_type st)
1122 {
1123 struct sysval_uniform *uniforms = (void *)buf;
1124
1125 for (unsigned i = 0; i < ss->sysval_count; ++i) {
1126 int sysval = ss->sysval[i];
1127
1128 switch (PAN_SYSVAL_TYPE(sysval)) {
1129 case PAN_SYSVAL_VIEWPORT_SCALE:
1130 panfrost_upload_viewport_scale_sysval(batch,
1131 &uniforms[i]);
1132 break;
1133 case PAN_SYSVAL_VIEWPORT_OFFSET:
1134 panfrost_upload_viewport_offset_sysval(batch,
1135 &uniforms[i]);
1136 break;
1137 case PAN_SYSVAL_TEXTURE_SIZE:
1138 panfrost_upload_txs_sysval(batch, st,
1139 PAN_SYSVAL_ID(sysval),
1140 &uniforms[i]);
1141 break;
1142 case PAN_SYSVAL_SSBO:
1143 panfrost_upload_ssbo_sysval(batch, st,
1144 PAN_SYSVAL_ID(sysval),
1145 &uniforms[i]);
1146 break;
1147 case PAN_SYSVAL_NUM_WORK_GROUPS:
1148 panfrost_upload_num_work_groups_sysval(batch,
1149 &uniforms[i]);
1150 break;
1151 case PAN_SYSVAL_SAMPLER:
1152 panfrost_upload_sampler_sysval(batch, st,
1153 PAN_SYSVAL_ID(sysval),
1154 &uniforms[i]);
1155 break;
1156 default:
1157 assert(0);
1158 }
1159 }
1160 }
1161
1162 static const void *
1163 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
1164 unsigned index)
1165 {
1166 struct pipe_constant_buffer *cb = &buf->cb[index];
1167 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
1168
1169 if (rsrc)
1170 return rsrc->bo->cpu;
1171 else if (cb->user_buffer)
1172 return cb->user_buffer;
1173 else
1174 unreachable("No constant buffer");
1175 }
1176
1177 void
1178 panfrost_emit_const_buf(struct panfrost_batch *batch,
1179 enum pipe_shader_type stage,
1180 struct mali_vertex_tiler_postfix *postfix)
1181 {
1182 struct panfrost_context *ctx = batch->ctx;
1183 struct panfrost_shader_variants *all = ctx->shader[stage];
1184
1185 if (!all)
1186 return;
1187
1188 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
1189
1190 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1191
1192 /* Uniforms are implicitly UBO #0 */
1193 bool has_uniforms = buf->enabled_mask & (1 << 0);
1194
1195 /* Allocate room for the sysval and the uniforms */
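/* (Each sysval occupies a full vec4, i.e. 16 bytes.) */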
1196 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
1197 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
1198 size_t size = sys_size + uniform_size;
1199 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1200 size);
1201
1202 /* Upload sysvals requested by the shader */
1203 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
1204
1205 /* Upload uniforms */
1206 if (has_uniforms && uniform_size) {
1207 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
1208 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
1209 }
1210
1211 /* Next up, attach UBOs. UBO #0 is the uniforms we just
1212 * uploaded */
1213
1214 unsigned ubo_count = panfrost_ubo_count(ctx, stage);
1215 assert(ubo_count >= 1);
1216
1217 size_t sz = sizeof(uint64_t) * ubo_count;
1218 uint64_t ubos[PAN_MAX_CONST_BUFFERS];
1219 int uniform_count = ss->uniform_count;
1220
1221 /* Upload uniforms as a UBO */
1222 ubos[0] = MALI_MAKE_UBO(2 + uniform_count, transfer.gpu);
1223
1224 /* The rest are honest-to-goodness UBOs */
1225
1226 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
1227 size_t usz = buf->cb[ubo].buffer_size;
1228 bool enabled = buf->enabled_mask & (1 << ubo);
1229 bool empty = usz == 0;
1230
1231 if (!enabled || empty) {
1232 /* Stub out disabled UBOs to catch accesses */
1233 ubos[ubo] = MALI_MAKE_UBO(0, 0xDEAD0000);
1234 continue;
1235 }
1236
1237 mali_ptr gpu = panfrost_map_constant_buffer_gpu(batch, stage,
1238 buf, ubo);
1239
1240 unsigned bytes_per_field = 16;
1241 unsigned aligned = ALIGN_POT(usz, bytes_per_field);
1242 ubos[ubo] = MALI_MAKE_UBO(aligned / bytes_per_field, gpu);
1243 }
1244
1245 mali_ptr ubufs = panfrost_upload_transient(batch, ubos, sz);
1246 postfix->uniforms = transfer.gpu;
1247 postfix->uniform_buffers = ubufs;
1248
1249 buf->dirty_mask = 0;
1250 }
1251
1252 void
1253 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1254 const struct pipe_grid_info *info,
1255 struct midgard_payload_vertex_tiler *vtp)
1256 {
1257 struct panfrost_context *ctx = batch->ctx;
1258 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1259 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1260 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1261 128));
1262 unsigned shared_size = single_size * info->grid[0] * info->grid[1] *
1263 info->grid[2] * 4;
1264 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1265 shared_size,
1266 1);
1267
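/* Descriptive note: the workgroup count is encoded below as a sum of
 * per-dimension ceil(log2) values, and the per-workgroup shared size as
 * a shift. */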
1268 struct mali_shared_memory shared = {
1269 .shared_memory = bo->gpu,
1270 .shared_workgroup_count =
1271 util_logbase2_ceil(info->grid[0]) +
1272 util_logbase2_ceil(info->grid[1]) +
1273 util_logbase2_ceil(info->grid[2]),
1274 .shared_unk1 = 0x2,
1275 .shared_shift = util_logbase2(single_size) - 1
1276 };
1277
1278 vtp->postfix.shared_memory = panfrost_upload_transient(batch, &shared,
1279 sizeof(shared));
1280 }
1281
1282 static mali_ptr
1283 panfrost_get_tex_desc(struct panfrost_batch *batch,
1284 enum pipe_shader_type st,
1285 struct panfrost_sampler_view *view)
1286 {
1287 if (!view)
1288 return (mali_ptr) 0;
1289
1290 struct pipe_sampler_view *pview = &view->base;
1291 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1292
1293 /* Add the BO to the job so it's retained until the job is done. */
1294
1295 panfrost_batch_add_bo(batch, rsrc->bo,
1296 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1297 panfrost_bo_access_for_stage(st));
1298
1299 panfrost_batch_add_bo(batch, view->midgard_bo,
1300 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1301 panfrost_bo_access_for_stage(st));
1302
1303 return view->midgard_bo->gpu;
1304 }
1305
1306 void
1307 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1308 enum pipe_shader_type stage,
1309 struct mali_vertex_tiler_postfix *postfix)
1310 {
1311 struct panfrost_context *ctx = batch->ctx;
1312 struct panfrost_device *device = pan_device(ctx->base.screen);
1313
1314 if (!ctx->sampler_view_count[stage])
1315 return;
1316
1317 if (device->quirks & IS_BIFROST) {
1318 struct bifrost_texture_descriptor *descriptors;
1319
1320 descriptors = malloc(sizeof(struct bifrost_texture_descriptor) *
1321 ctx->sampler_view_count[stage]);
1322
1323 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1324 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1325 struct pipe_sampler_view *pview = &view->base;
1326 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1327
1328 /* Add the BOs to the job so they are retained until the job is done. */
1329
1330 panfrost_batch_add_bo(batch, rsrc->bo,
1331 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1332 panfrost_bo_access_for_stage(stage));
1333
1334 panfrost_batch_add_bo(batch, view->bifrost_bo,
1335 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1336 panfrost_bo_access_for_stage(stage));
1337
1338 memcpy(&descriptors[i], view->bifrost_descriptor, sizeof(*view->bifrost_descriptor));
1339 }
1340
1341 postfix->textures = panfrost_upload_transient(batch,
1342 descriptors,
1343 sizeof(struct bifrost_texture_descriptor) *
1344 ctx->sampler_view_count[stage]);
1345
1346 free(descriptors);
1347 } else {
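/* Midgard references textures through an indirection: an array of GPU
 * pointers ("trampolines"), one per sampler view, each pointing at that
 * view's texture descriptor. */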
1348 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1349
1350 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i)
1351 trampolines[i] = panfrost_get_tex_desc(batch, stage,
1352 ctx->sampler_views[stage][i]);
1353
1354 postfix->textures = panfrost_upload_transient(batch,
1355 trampolines,
1356 sizeof(uint64_t) *
1357 ctx->sampler_view_count[stage]);
1358 }
1359 }
1360
1361 void
1362 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1363 enum pipe_shader_type stage,
1364 struct mali_vertex_tiler_postfix *postfix)
1365 {
1366 struct panfrost_context *ctx = batch->ctx;
1367 struct panfrost_device *device = pan_device(ctx->base.screen);
1368
1369 if (!ctx->sampler_count[stage])
1370 return;
1371
1372 if (device->quirks & IS_BIFROST) {
1373 size_t desc_size = sizeof(struct bifrost_sampler_descriptor);
1374 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1375 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1376 transfer_size);
1377 struct bifrost_sampler_descriptor *desc = (struct bifrost_sampler_descriptor *)transfer.cpu;
1378
1379 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1380 desc[i] = ctx->samplers[stage][i]->bifrost_hw;
1381
1382 postfix->sampler_descriptor = transfer.gpu;
1383 } else {
1384 size_t desc_size = sizeof(struct mali_sampler_descriptor);
1385 size_t transfer_size = desc_size * ctx->sampler_count[stage];
1386 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1387 transfer_size);
1388 struct mali_sampler_descriptor *desc = (struct mali_sampler_descriptor *)transfer.cpu;
1389
1390 for (int i = 0; i < ctx->sampler_count[stage]; ++i)
1391 desc[i] = ctx->samplers[stage][i]->midgard_hw;
1392
1393 postfix->sampler_descriptor = transfer.gpu;
1394 }
1395 }
1396
1397 void
1398 panfrost_emit_vertex_attr_meta(struct panfrost_batch *batch,
1399 struct mali_vertex_tiler_postfix *vertex_postfix)
1400 {
1401 struct panfrost_context *ctx = batch->ctx;
1402
1403 if (!ctx->vertex)
1404 return;
1405
1406 struct panfrost_vertex_state *so = ctx->vertex;
1407
1408 panfrost_vertex_state_upd_attr_offs(ctx, vertex_postfix);
1409 vertex_postfix->attribute_meta = panfrost_upload_transient(batch, so->hw,
1410 sizeof(*so->hw) *
1411 PAN_MAX_ATTRIBUTE);
1412 }
1413
1414 void
1415 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1416 struct mali_vertex_tiler_postfix *vertex_postfix)
1417 {
1418 struct panfrost_context *ctx = batch->ctx;
1419 struct panfrost_vertex_state *so = ctx->vertex;
1420
1421 /* Staged mali_attr, and index into them. i =/= k, depending on the
1422 * vertex buffer mask and instancing. Twice as much room is allocated,
1423 * for a worst case of NPOT_DIVIDEs, which each take up an extra slot */
1424 union mali_attr attrs[PIPE_MAX_ATTRIBS * 2];
1425 unsigned k = 0;
1426
1427 for (unsigned i = 0; i < so->num_elements; ++i) {
1428 /* We map a mali_attr to be 1:1 with the mali_attr_meta, which
1429 * means duplicating some vertex buffers (who cares? aside from
1430 * maybe some caching implications but I somehow doubt that
1431 * matters) */
1432
1433 struct pipe_vertex_element *elem = &so->pipe[i];
1434 unsigned vbi = elem->vertex_buffer_index;
1435
1436 /* The exception to 1:1 mapping is that we can have multiple
1437 * entries (NPOT divisors), so we fixup anyways */
1438
1439 so->hw[i].index = k;
1440
1441 if (!(ctx->vb_mask & (1 << vbi)))
1442 continue;
1443
1444 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1445 struct panfrost_resource *rsrc;
1446
1447 rsrc = pan_resource(buf->buffer.resource);
1448 if (!rsrc)
1449 continue;
1450
1451 /* Align to 64 bytes by masking off the lower bits. This
1452 * will be adjusted back when we fixup the src_offset in
1453 * mali_attr_meta */
1454
1455 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1456 mali_ptr addr = raw_addr & ~63;
1457 unsigned chopped_addr = raw_addr - addr;
1458
1459 /* Add a dependency of the batch on the vertex buffer */
1460 panfrost_batch_add_bo(batch, rsrc->bo,
1461 PAN_BO_ACCESS_SHARED |
1462 PAN_BO_ACCESS_READ |
1463 PAN_BO_ACCESS_VERTEX_TILER);
1464
1465 /* Set common fields */
1466 attrs[k].elements = addr;
1467 attrs[k].stride = buf->stride;
1468
1469 /* Since we advanced the base pointer, we shrink the buffer
1470 * size */
1471 attrs[k].size = rsrc->base.width0 - buf->buffer_offset;
1472
1473 /* We need to add the extra size we masked off (for
1474 * correctness) so the data doesn't get clamped away */
1475 attrs[k].size += chopped_addr;
1476
1477 /* For non-instancing make sure we initialize */
1478 attrs[k].shift = attrs[k].extra_flags = 0;
1479
1480 /* Instancing uses a dramatically different code path than
1481 * linear, so dispatch for the actual emission now that the
1482 * common code is finished */
1483
1484 unsigned divisor = elem->instance_divisor;
1485
1486 if (divisor && ctx->instance_count == 1) {
1487 /* Silly corner case where there's a divisor(=1) but
1488 * there's no legitimate instancing. So we want *every*
1489 * attribute to be the same. So set stride to zero so
1490 * we don't go anywhere. */
1491
1492 attrs[k].size = attrs[k].stride + chopped_addr;
1493 attrs[k].stride = 0;
1494 attrs[k++].elements |= MALI_ATTR_LINEAR;
1495 } else if (ctx->instance_count <= 1) {
1496 /* Normal, non-instanced attributes */
1497 attrs[k++].elements |= MALI_ATTR_LINEAR;
1498 } else {
1499 unsigned instance_shift = vertex_postfix->instance_shift;
1500 unsigned instance_odd = vertex_postfix->instance_odd;
1501
1502 k += panfrost_vertex_instanced(ctx->padded_count,
1503 instance_shift,
1504 instance_odd,
1505 divisor, &attrs[k]);
1506 }
1507 }
1508
1509 /* Add special gl_VertexID/gl_InstanceID buffers */
1510
1511 panfrost_vertex_id(ctx->padded_count, &attrs[k]);
1512 so->hw[PAN_VERTEX_ID].index = k++;
1513 panfrost_instance_id(ctx->padded_count, &attrs[k]);
1514 so->hw[PAN_INSTANCE_ID].index = k++;
1515
1516 /* Upload whatever we emitted and go */
1517
1518 vertex_postfix->attributes = panfrost_upload_transient(batch, attrs,
1519 k * sizeof(*attrs));
1520 }
1521
1522 static mali_ptr
1523 panfrost_emit_varyings(struct panfrost_batch *batch, union mali_attr *slot,
1524 unsigned stride, unsigned count)
1525 {
1526 /* Fill out the descriptor */
1527 slot->stride = stride;
1528 slot->size = stride * count;
1529 slot->shift = slot->extra_flags = 0;
1530
1531 struct panfrost_transfer transfer = panfrost_allocate_transient(batch,
1532 slot->size);
1533
1534 slot->elements = transfer.gpu | MALI_ATTR_LINEAR;
1535
1536 return transfer.gpu;
1537 }
1538
1539 static void
1540 panfrost_emit_streamout(struct panfrost_batch *batch, union mali_attr *slot,
1541 unsigned stride, unsigned offset, unsigned count,
1542 struct pipe_stream_output_target *target)
1543 {
1544 /* Fill out the descriptor */
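/* Gallium expresses stream-output strides in dwords; the hardware slot
 * wants bytes, hence the multiply by 4. */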
1545 slot->stride = stride * 4;
1546 slot->shift = slot->extra_flags = 0;
1547
1548 unsigned max_size = target->buffer_size;
1549 unsigned expected_size = slot->stride * count;
1550
1551 slot->size = MIN2(max_size, expected_size);
1552
1553 /* Grab the BO and bind it to the batch */
1554 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1555
1556 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1557 * the perspective of the TILER and FRAGMENT.
1558 */
1559 panfrost_batch_add_bo(batch, bo,
1560 PAN_BO_ACCESS_SHARED |
1561 PAN_BO_ACCESS_RW |
1562 PAN_BO_ACCESS_VERTEX_TILER |
1563 PAN_BO_ACCESS_FRAGMENT);
1564
1565 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * slot->stride);
1566 slot->elements = addr;
1567 }
1568
1569 /* Given a shader and buffer indices, link varying metadata together */
1570
1571 static bool
1572 is_special_varying(gl_varying_slot loc)
1573 {
1574 switch (loc) {
1575 case VARYING_SLOT_POS:
1576 case VARYING_SLOT_PSIZ:
1577 case VARYING_SLOT_PNTC:
1578 case VARYING_SLOT_FACE:
1579 return true;
1580 default:
1581 return false;
1582 }
1583 }
1584
1585 static void
1586 panfrost_emit_varying_meta(void *outptr, struct panfrost_shader_state *ss,
1587 signed general, signed gl_Position,
1588 signed gl_PointSize, signed gl_PointCoord,
1589 signed gl_FrontFacing)
1590 {
1591 struct mali_attr_meta *out = (struct mali_attr_meta *) outptr;
1592
1593 for (unsigned i = 0; i < ss->varying_count; ++i) {
1594 gl_varying_slot location = ss->varyings_loc[i];
1595 int index = -1;
1596
1597 switch (location) {
1598 case VARYING_SLOT_POS:
1599 index = gl_Position;
1600 break;
1601 case VARYING_SLOT_PSIZ:
1602 index = gl_PointSize;
1603 break;
1604 case VARYING_SLOT_PNTC:
1605 index = gl_PointCoord;
1606 break;
1607 case VARYING_SLOT_FACE:
1608 index = gl_FrontFacing;
1609 break;
1610 default:
1611 index = general;
1612 break;
1613 }
1614
1615 assert(index >= 0);
1616 out[i].index = index;
1617 }
1618 }
1619
1620 static bool
1621 has_point_coord(unsigned mask, gl_varying_slot loc)
1622 {
1623 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1624 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1625 else if (loc == VARYING_SLOT_PNTC)
1626 return (mask & (1 << 8));
1627 else
1628 return false;
1629 }
1630
1631 /* Helpers for manipulating stream out information so we can pack varyings
1632 * accordingly. Compute the src_offset for a given captured varying */
1633
1634 static struct pipe_stream_output *
1635 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1636 {
1637 for (unsigned i = 0; i < info->num_outputs; ++i) {
1638 if (info->output[i].register_index == loc)
1639 return &info->output[i];
1640 }
1641
1642 unreachable("Varying not captured");
1643 }
1644
1645 /* TODO: Integers */
1646 static enum mali_format
1647 pan_xfb_format(unsigned nr_components)
1648 {
1649 switch (nr_components) {
1650 case 1: return MALI_R32F;
1651 case 2: return MALI_RG32F;
1652 case 3: return MALI_RGB32F;
1653 case 4: return MALI_RGBA32F;
1654 default: unreachable("Invalid format");
1655 }
1656 }
1657
1658 void
1659 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1660 unsigned vertex_count,
1661 struct mali_vertex_tiler_postfix *vertex_postfix,
1662 struct mali_vertex_tiler_postfix *tiler_postfix,
1663 union midgard_primitive_size *primitive_size)
1664 {
1665 /* Load the shaders */
1666 struct panfrost_context *ctx = batch->ctx;
1667 struct panfrost_shader_state *vs, *fs;
1668 unsigned int num_gen_varyings = 0;
1669 size_t vs_size, fs_size;
1670
1671 /* Allocate the varying descriptor */
1672
1673 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1674 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1675 vs_size = sizeof(struct mali_attr_meta) * vs->varying_count;
1676 fs_size = sizeof(struct mali_attr_meta) * fs->varying_count;
1677
1678 struct panfrost_transfer trans = panfrost_allocate_transient(batch,
1679 vs_size +
1680 fs_size);
1681
1682 struct pipe_stream_output_info *so = &vs->stream_output;
1683
1684 /* Check if this varying is linked by us. This is the case for
1685 * general-purpose, non-captured varyings. If it is, link it. If it's
1686 * not, use the provided stream out information to determine the
1687 * offset, since it was already linked for us. */
1688
1689 for (unsigned i = 0; i < vs->varying_count; i++) {
1690 gl_varying_slot loc = vs->varyings_loc[i];
1691
1692 bool special = is_special_varying(loc);
1693 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1694
1695 if (captured) {
1696 struct pipe_stream_output *o = pan_get_so(so, loc);
1697
1698 unsigned dst_offset = o->dst_offset * 4; /* dwords */
1699 vs->varyings[i].src_offset = dst_offset;
1700 } else if (!special) {
1701 vs->varyings[i].src_offset = 16 * (num_gen_varyings++);
1702 }
1703 }
1704
1705 /* Conversely, we need to set src_offset for the captured varyings.
1706 * Here, the layout is defined by the stream out info, not us */
1707
1708 /* Link up with fragment varyings */
1709 bool reads_point_coord = fs->reads_point_coord;
1710
1711 for (unsigned i = 0; i < fs->varying_count; i++) {
1712 gl_varying_slot loc = fs->varyings_loc[i];
1713 unsigned src_offset;
1714 signed vs_idx = -1;
1715
1716 /* Link up */
1717 for (unsigned j = 0; j < vs->varying_count; ++j) {
1718 if (vs->varyings_loc[j] == loc) {
1719 vs_idx = j;
1720 break;
1721 }
1722 }
1723
1724 /* Either assign or reuse */
1725 if (vs_idx >= 0)
1726 src_offset = vs->varyings[vs_idx].src_offset;
1727 else
1728 src_offset = 16 * (num_gen_varyings++);
1729
1730 fs->varyings[i].src_offset = src_offset;
1731
1732 if (has_point_coord(fs->point_sprite_mask, loc))
1733 reads_point_coord = true;
1734 }
1735
1736 memcpy(trans.cpu, vs->varyings, vs_size);
1737 memcpy(trans.cpu + vs_size, fs->varyings, fs_size);
1738
1739 union mali_attr varyings[PIPE_MAX_ATTRIBS] = {0};
1740
1741 /* Figure out how many streamout buffers could be bound */
1742 unsigned so_count = ctx->streamout.num_targets;
1743 for (unsigned i = 0; i < vs->varying_count; i++) {
1744 gl_varying_slot loc = vs->varyings_loc[i];
1745
1746 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1747 if (!captured) continue;
1748
1749 struct pipe_stream_output *o = pan_get_so(so, loc);
1750 so_count = MAX2(so_count, o->output_buffer + 1);
1751 }
1752
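/* Assign varying buffer indices: stream-output buffers come first, then the
 * general varying buffer, then one buffer per special varying actually used. */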
1753 signed idx = so_count;
1754 signed general = idx++;
1755 signed gl_Position = idx++;
1756 signed gl_PointSize = vs->writes_point_size ? (idx++) : -1;
1757 signed gl_PointCoord = reads_point_coord ? (idx++) : -1;
1758 signed gl_FrontFacing = fs->reads_face ? (idx++) : -1;
1759 signed gl_FragCoord = fs->reads_frag_coord ? (idx++) : -1;
1760
1761 /* Emit the stream out buffers */
1762
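/* Transform feedback writes one vertex per vertex of the decomposed
 * primitives, which can exceed the raw draw vertex count (strips,
 * fans), so size the buffers for that. */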
1763 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1764 ctx->vertex_count);
1765
1766 for (unsigned i = 0; i < so_count; ++i) {
1767 if (i < ctx->streamout.num_targets) {
1768 panfrost_emit_streamout(batch, &varyings[i],
1769 so->stride[i],
1770 ctx->streamout.offsets[i],
1771 out_count,
1772 ctx->streamout.targets[i]);
1773 } else {
1774 /* Emit a dummy buffer */
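/* No target is bound at this index, but a captured varying may
 * still reference the record, so allocate a throwaway buffer to
 * keep the pointer valid. */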
1775 panfrost_emit_varyings(batch, &varyings[i],
1776 so->stride[i] * 4,
1777 out_count);
1778
1779 /* Clear the attribute type */
1780 varyings[i].elements &= ~0xF;
1781 }
1782 }
1783
1784 panfrost_emit_varyings(batch, &varyings[general],
1785 num_gen_varyings * 16,
1786 vertex_count);
1787
1788 mali_ptr varyings_p;
1789
1790 /* fp32 vec4 gl_Position */
1791 varyings_p = panfrost_emit_varyings(batch, &varyings[gl_Position],
1792 sizeof(float) * 4, vertex_count);
1793 tiler_postfix->position_varying = varyings_p;
1794
1795
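/* 2 bytes per vertex for gl_PointSize, presumably a half-float,
 * alongside the fp32 vec4 allocated for gl_Position above. */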
1796 if (panfrost_writes_point_size(ctx)) {
1797 varyings_p = panfrost_emit_varyings(batch,
1798 &varyings[gl_PointSize],
1799 2, vertex_count);
1800 primitive_size->pointer = varyings_p;
1801 }
1802
1803 if (reads_point_coord)
1804 varyings[gl_PointCoord].elements = MALI_VARYING_POINT_COORD;
1805
1806 if (fs->reads_face)
1807 varyings[gl_FrontFacing].elements = MALI_VARYING_FRONT_FACING;
1808
1809 if (fs->reads_frag_coord)
1810 varyings[gl_FragCoord].elements = MALI_VARYING_FRAG_COORD;
1811
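/* None of these special records are expected on Bifrost, presumably
 * because this path is not wired up for it yet; the assert below
 * documents that assumption. */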
1812 struct panfrost_device *device = pan_device(ctx->base.screen);
1813 assert(!(device->quirks & IS_BIFROST) || !(reads_point_coord || fs->reads_face || fs->reads_frag_coord));
1814
1815 /* Let's go ahead and link varying meta to the buffer in question, now
1816 * that that information is available. VARYING_SLOT_POS is mapped to
1817 * gl_FragCoord for fragment shaders but gl_Position for vertex
1818 * shaders. */
1819
1820 panfrost_emit_varying_meta(trans.cpu, vs, general, gl_Position,
1821 gl_PointSize, gl_PointCoord,
1822 gl_FrontFacing);
1823
1824 panfrost_emit_varying_meta(trans.cpu + vs_size, fs, general,
1825 gl_FragCoord, gl_PointSize,
1826 gl_PointCoord, gl_FrontFacing);
1827
1828 /* Replace streamout */
1829
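/* Captured varyings must read from the transform feedback buffer they
 * were written to rather than from the general varying buffer, so
 * rewrite the buffer index, format and swizzle in the uploaded records
 * (ovs for the vertex copy, ofs for the fragment copy). */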
1830 struct mali_attr_meta *ovs = (struct mali_attr_meta *)trans.cpu;
1831 struct mali_attr_meta *ofs = ovs + vs->varying_count;
1832
1833 for (unsigned i = 0; i < vs->varying_count; i++) {
1834 gl_varying_slot loc = vs->varyings_loc[i];
1835
1836 bool captured = ((vs->so_mask & (1ll << loc)) ? true : false);
1837 if (!captured)
1838 continue;
1839
1840 struct pipe_stream_output *o = pan_get_so(so, loc);
1841 ovs[i].index = o->output_buffer;
1842
1843 /* Set the type appropriately. TODO: Integer varyings XXX */
1844 assert(o->stream == 0);
1845 ovs[i].format = pan_xfb_format(o->num_components);
1846
1847 if (device->quirks & HAS_SWIZZLES)
1848 ovs[i].swizzle = panfrost_get_default_swizzle(o->num_components);
1849 else
1850 ovs[i].swizzle = panfrost_bifrost_swizzle(o->num_components);
1851
1852 /* Link to the fragment */
1853 signed fs_idx = -1;
1854
1855 /* Link up */
1856 for (unsigned j = 0; j < fs->varying_count; ++j) {
1857 if (fs->varyings_loc[j] == loc) {
1858 fs_idx = j;
1859 break;
1860 }
1861 }
1862
1863 if (fs_idx >= 0) {
1864 ofs[fs_idx].index = ovs[i].index;
1865 ofs[fs_idx].format = ovs[i].format;
1866 ofs[fs_idx].swizzle = ovs[i].swizzle;
1867 }
1868 }
1869
1870 /* Replace point sprite */
1871 for (unsigned i = 0; i < fs->varying_count; i++) {
1872 /* If we have a point sprite replacement, handle that here. We
1873 * have to translate the location first. TODO: flip y in the shader;
1874 * we already key the variant, the flip is just deferred for lack of time. */
1875
1876 if (has_point_coord(fs->point_sprite_mask,
1877 fs->varyings_loc[i])) {
1878 ofs[i].index = gl_PointCoord;
1879
1880 /* Swizzle out the z/w to 0/1 */
1881 ofs[i].format = MALI_RG16F;
1882 ofs[i].swizzle = panfrost_get_default_swizzle(2);
1883 }
1884 }
1885
1886 /* Fix up unaligned addresses */
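/* The stream-out offsets handed to us need not be 64-byte aligned, but
 * the low bits of a record's address apparently carry its addressing
 * mode, so round the base down and fold the misalignment into each
 * referencing varying's src_offset instead. */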
1887 for (unsigned i = 0; i < so_count; ++i) {
1888 if (varyings[i].elements < MALI_RECORD_SPECIAL)
1889 continue;
1890
1891 unsigned align = (varyings[i].elements & 63);
1892
1893 /* While we're at it, the SO buffers are linear */
1894
1895 if (!align) {
1896 varyings[i].elements |= MALI_ATTR_LINEAR;
1897 continue;
1898 }
1899
1900 /* We need to adjust alignment */
1901 varyings[i].elements &= ~63;
1902 varyings[i].elements |= MALI_ATTR_LINEAR;
1903 varyings[i].size += align;
1904
1905 for (unsigned v = 0; v < vs->varying_count; ++v) {
1906 if (ovs[v].index != i)
1907 continue;
1908
1909 ovs[v].src_offset = vs->varyings[v].src_offset + align;
1910 }
1911
1912 for (unsigned f = 0; f < fs->varying_count; ++f) {
1913 if (ofs[f].index != i)
1914 continue;
1915
1916 ofs[f].src_offset = fs->varyings[f].src_offset + align;
1917 }
1918 }
1919
1920 varyings_p = panfrost_upload_transient(batch, varyings,
1921 idx * sizeof(*varyings));
1922 vertex_postfix->varyings = varyings_p;
1923 tiler_postfix->varyings = varyings_p;
1924
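/* The descriptor allocation holds the VS records followed by the FS
 * records, so the vertex job reads from the start and the tiler job
 * from vs_size bytes in. */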
1925 vertex_postfix->varying_meta = trans.gpu;
1926 tiler_postfix->varying_meta = trans.gpu + vs_size;
1927 }
1928
1929 void
1930 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1931 struct mali_vertex_tiler_prefix *vertex_prefix,
1932 struct mali_vertex_tiler_postfix *vertex_postfix,
1933 struct mali_vertex_tiler_prefix *tiler_prefix,
1934 struct mali_vertex_tiler_postfix *tiler_postfix,
1935 union midgard_primitive_size *primitive_size)
1936 {
1937 struct panfrost_context *ctx = batch->ctx;
1938 struct panfrost_device *device = pan_device(ctx->base.screen);
1939 bool wallpapering = ctx->wallpaper_batch && batch->tiler_dep;
1940 struct bifrost_payload_vertex bifrost_vertex = {0,};
1941 struct bifrost_payload_tiler bifrost_tiler = {0,};
1942 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1943 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1944 void *vp, *tp;
1945 size_t vp_size, tp_size;
1946
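/* Bifrost and Midgard use different vertex/tiler payload layouts;
 * build whichever one applies and point vp/tp at it so the job
 * submission below stays layout-agnostic. */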
1947 if (device->quirks & IS_BIFROST) {
1948 bifrost_vertex.prefix = *vertex_prefix;
1949 bifrost_vertex.postfix = *vertex_postfix;
1950 vp = &bifrost_vertex;
1951 vp_size = sizeof(bifrost_vertex);
1952
1953 bifrost_tiler.prefix = *tiler_prefix;
1954 bifrost_tiler.tiler.primitive_size = *primitive_size;
1955 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1956 bifrost_tiler.postfix = *tiler_postfix;
1957 tp = &bifrost_tiler;
1958 tp_size = sizeof(bifrost_tiler);
1959 } else {
1960 midgard_vertex.prefix = *vertex_prefix;
1961 midgard_vertex.postfix = *vertex_postfix;
1962 vp = &midgard_vertex;
1963 vp_size = sizeof(midgard_vertex);
1964
1965 midgard_tiler.prefix = *tiler_prefix;
1966 midgard_tiler.postfix = *tiler_postfix;
1967 midgard_tiler.primitive_size = *primitive_size;
1968 tp = &midgard_tiler;
1969 tp_size = sizeof(midgard_tiler);
1970 }
1971
1972 if (wallpapering) {
1973 /* Inject in reverse order, with "predicted" job indices.
1974 * THIS IS A HACK XXX */
1975 panfrost_new_job(batch, JOB_TYPE_TILER, false,
1976 batch->job_index + 2, tp, tp_size, true);
1977 panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1978 vp, vp_size, true);
1979 return;
1980 }
1981
1982 /* If rasterizer discard is enabled, only submit the vertex job */
1983
1984 bool rasterizer_discard = ctx->rasterizer &&
1985 ctx->rasterizer->base.rasterizer_discard;
1986
1987 unsigned vertex = panfrost_new_job(batch, JOB_TYPE_VERTEX, false, 0,
1988 vp, vp_size, false);
1989
1990 if (rasterizer_discard)
1991 return;
1992
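/* The tiler job consumes the vertex job's output, so pass the vertex
 * job index as its dependency. */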
1993 panfrost_new_job(batch, JOB_TYPE_TILER, false, vertex, tp, tp_size,
1994 false);
1995 }
1996
1997 /* TODO: stop hardcoding this */
1998 mali_ptr
1999 panfrost_emit_sample_locations(struct panfrost_batch *batch)
2000 {
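/* 96 16-bit values uploaded verbatim; presumably (x, y) sample
 * position pairs on a 0..256 scale with (128, 128) the pixel
 * centre, though the exact layout is undocumented here. */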
2001 uint16_t locations[] = {
2002 128, 128,
2003 0, 256,
2004 0, 256,
2005 0, 256,
2006 0, 256,
2007 0, 256,
2008 0, 256,
2009 0, 256,
2010 0, 256,
2011 0, 256,
2012 0, 256,
2013 0, 256,
2014 0, 256,
2015 0, 256,
2016 0, 256,
2017 0, 256,
2018 0, 256,
2019 0, 256,
2020 0, 256,
2021 0, 256,
2022 0, 256,
2023 0, 256,
2024 0, 256,
2025 0, 256,
2026 0, 256,
2027 0, 256,
2028 0, 256,
2029 0, 256,
2030 0, 256,
2031 0, 256,
2032 0, 256,
2033 0, 256,
2034 128, 128,
2035 0, 0,
2036 0, 0,
2037 0, 0,
2038 0, 0,
2039 0, 0,
2040 0, 0,
2041 0, 0,
2042 0, 0,
2043 0, 0,
2044 0, 0,
2045 0, 0,
2046 0, 0,
2047 0, 0,
2048 0, 0,
2049 0, 0,
2050 };
2051
2052 return panfrost_upload_transient(batch, locations, 96 * sizeof(uint16_t));
2053 }