[mesa.git] / src / gallium / drivers / panfrost / pan_cmdstream.c
1 /*
2 * Copyright (C) 2018 Alyssa Rosenzweig
3 * Copyright (C) 2020 Collabora Ltd.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25 #include "util/macros.h"
26 #include "util/u_prim.h"
27 #include "util/u_vbuf.h"
28
29 #include "panfrost-quirks.h"
30
31 #include "pan_pool.h"
32 #include "pan_bo.h"
33 #include "pan_cmdstream.h"
34 #include "pan_context.h"
35 #include "pan_job.h"
36
37 /* If a BO is accessed for a particular shader stage, will it be in the primary
38 * batch (vertex/tiler) or the secondary batch (fragment)? Anything but
39 * fragment will be primary, e.g. compute jobs will be considered
40 * "vertex/tiler" by analogy */
41
42 static inline uint32_t
43 panfrost_bo_access_for_stage(enum pipe_shader_type stage)
44 {
45 assert(stage == PIPE_SHADER_FRAGMENT ||
46 stage == PIPE_SHADER_VERTEX ||
47 stage == PIPE_SHADER_COMPUTE);
48
49 return stage == PIPE_SHADER_FRAGMENT ?
50 PAN_BO_ACCESS_FRAGMENT :
51 PAN_BO_ACCESS_VERTEX_TILER;
52 }
53
54 mali_ptr
55 panfrost_vt_emit_shared_memory(struct panfrost_batch *batch)
56 {
57 struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
58
59 struct mali_shared_memory shared = {
60 .shared_workgroup_count = ~0,
61 };
62
63 if (batch->stack_size) {
64 struct panfrost_bo *stack =
65 panfrost_batch_get_scratchpad(batch, batch->stack_size,
66 dev->thread_tls_alloc,
67 dev->core_count);
68
69 shared.stack_shift = panfrost_get_stack_shift(batch->stack_size);
70 shared.scratchpad = stack->gpu;
71 }
72
73 return panfrost_pool_upload_aligned(&batch->pool, &shared, sizeof(shared), 64);
74 }
75
76 void
77 panfrost_vt_update_primitive_size(struct panfrost_context *ctx,
78 struct mali_vertex_tiler_prefix *prefix,
79 union midgard_primitive_size *primitive_size)
80 {
81 struct panfrost_rasterizer *rasterizer = ctx->rasterizer;
82
83 if (!panfrost_writes_point_size(ctx)) {
84 float val = (prefix->draw_mode == MALI_DRAW_MODE_POINTS) ?
85 rasterizer->base.point_size :
86 rasterizer->base.line_width;
87
88 primitive_size->constant = val;
89 }
90 }
91
92 static unsigned
93 panfrost_translate_index_size(unsigned size)
94 {
95 switch (size) {
96 case 1:
97 return MALI_DRAW_INDEXED_UINT8;
98
99 case 2:
100 return MALI_DRAW_INDEXED_UINT16;
101
102 case 4:
103 return MALI_DRAW_INDEXED_UINT32;
104
105 default:
106 unreachable("Invalid index size");
107 }
108 }
109
110 /* Gets a GPU address for the associated index buffer. Only guaranteed to be
111 * good for the duration of the draw (transient), could last longer. Also get
112 * the bounds on the index buffer for the range accessed by the draw. We do
113 * these operations together because there are natural optimizations which
114 * require them to be together. */
115
116 static mali_ptr
117 panfrost_get_index_buffer_bounded(struct panfrost_context *ctx,
118 const struct pipe_draw_info *info,
119 unsigned *min_index, unsigned *max_index)
120 {
121 struct panfrost_resource *rsrc = pan_resource(info->index.resource);
122 struct panfrost_batch *batch = panfrost_get_batch_for_fbo(ctx);
123 off_t offset = info->start * info->index_size;
124 bool needs_indices = true;
125 mali_ptr out = 0;
126
127 if (info->max_index != ~0u) {
128 *min_index = info->min_index;
129 *max_index = info->max_index;
130 needs_indices = false;
131 }
132
133 if (!info->has_user_indices) {
134 /* Only resources can be directly mapped */
135 panfrost_batch_add_bo(batch, rsrc->bo,
136 PAN_BO_ACCESS_SHARED |
137 PAN_BO_ACCESS_READ |
138 PAN_BO_ACCESS_VERTEX_TILER);
139 out = rsrc->bo->gpu + offset;
140
141 /* Check the cache */
142 needs_indices = !panfrost_minmax_cache_get(rsrc->index_cache,
143 info->start,
144 info->count,
145 min_index,
146 max_index);
147 } else {
148 /* Otherwise, we need to upload to transient memory */
149 const uint8_t *ibuf8 = (const uint8_t *) info->index.user;
150 struct panfrost_transfer T =
151 panfrost_pool_alloc_aligned(&batch->pool,
152 info->count * info->index_size,
153 info->index_size);
154
155 memcpy(T.cpu, ibuf8 + offset, info->count * info->index_size);
156 out = T.gpu;
157 }
158
159 if (needs_indices) {
160 /* Fallback */
161 u_vbuf_get_minmax_index(&ctx->base, info, min_index, max_index);
162
163 if (!info->has_user_indices)
164 panfrost_minmax_cache_add(rsrc->index_cache,
165 info->start, info->count,
166 *min_index, *max_index);
167 }
168
169 return out;
170 }
171
172 void
173 panfrost_vt_set_draw_info(struct panfrost_context *ctx,
174 const struct pipe_draw_info *info,
175 enum mali_draw_mode draw_mode,
176 struct mali_vertex_tiler_postfix *vertex_postfix,
177 struct mali_vertex_tiler_prefix *tiler_prefix,
178 struct mali_vertex_tiler_postfix *tiler_postfix,
179 unsigned *vertex_count,
180 unsigned *padded_count)
181 {
182 tiler_prefix->draw_mode = draw_mode;
183
184 unsigned draw_flags = 0;
185
186 if (panfrost_writes_point_size(ctx))
187 draw_flags |= MALI_DRAW_VARYING_SIZE;
188
189 if (info->primitive_restart)
190 draw_flags |= MALI_DRAW_PRIMITIVE_RESTART_FIXED_INDEX;
191
192 /* These don't make much sense */
193
194 draw_flags |= 0x3000;
195
196 if (info->index_size) {
197 unsigned min_index = 0, max_index = 0;
198
199 tiler_prefix->indices = panfrost_get_index_buffer_bounded(ctx,
200 info,
201 &min_index,
202 &max_index);
203
204 /* Use the corresponding values */
205 *vertex_count = max_index - min_index + 1;
206 tiler_postfix->offset_start = vertex_postfix->offset_start = min_index + info->index_bias;
207 tiler_prefix->offset_bias_correction = -min_index;
208 tiler_prefix->index_count = MALI_POSITIVE(info->count);
209 draw_flags |= panfrost_translate_index_size(info->index_size);
210 } else {
211 tiler_prefix->indices = 0;
212 *vertex_count = ctx->vertex_count;
213 tiler_postfix->offset_start = vertex_postfix->offset_start = info->start;
214 tiler_prefix->offset_bias_correction = 0;
215 tiler_prefix->index_count = MALI_POSITIVE(ctx->vertex_count);
216 }
217
218 tiler_prefix->unknown_draw = draw_flags;
219 ctx->offset_start = vertex_postfix->offset_start;
220
221 /* Encode the padded vertex count */
222
223 if (info->instance_count > 1) {
224 *padded_count = panfrost_padded_vertex_count(*vertex_count);
225
226 unsigned shift = __builtin_ctz(ctx->padded_count);
227 unsigned k = ctx->padded_count >> (shift + 1);
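/* (The padded count decomposes as padded = (2k + 1) << shift; e.g. for a
 * padded count of 12 = 3 << 2, shift = ctz(12) = 2 and k = 12 >> 3 = 1.) */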
228
229 tiler_postfix->instance_shift = vertex_postfix->instance_shift = shift;
230 tiler_postfix->instance_odd = vertex_postfix->instance_odd = k;
231 } else {
232 *padded_count = *vertex_count;
233
234 /* Reset instancing state */
235 tiler_postfix->instance_shift = vertex_postfix->instance_shift = 0;
236 tiler_postfix->instance_odd = vertex_postfix->instance_odd = 0;
237 }
238 }
239
240 static unsigned
241 translate_tex_wrap(enum pipe_tex_wrap w)
242 {
243 switch (w) {
244 case PIPE_TEX_WRAP_REPEAT: return MALI_WRAP_MODE_REPEAT;
245 case PIPE_TEX_WRAP_CLAMP: return MALI_WRAP_MODE_CLAMP;
246 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: return MALI_WRAP_MODE_CLAMP_TO_EDGE;
247 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return MALI_WRAP_MODE_CLAMP_TO_BORDER;
248 case PIPE_TEX_WRAP_MIRROR_REPEAT: return MALI_WRAP_MODE_MIRRORED_REPEAT;
249 case PIPE_TEX_WRAP_MIRROR_CLAMP: return MALI_WRAP_MODE_MIRRORED_CLAMP;
250 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_EDGE;
251 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: return MALI_WRAP_MODE_MIRRORED_CLAMP_TO_BORDER;
252 default: unreachable("Invalid wrap");
253 }
254 }
255
256 /* The hardware compares in the wrong order, so we have to flip before
257 * encoding. Yes, really. */
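/* (Flipping swaps the comparison operands, so LESS and GREATER trade places,
 * as do LEQUAL and GEQUAL; symmetric functions are unaffected.) */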
258
259 static enum mali_func
260 panfrost_sampler_compare_func(const struct pipe_sampler_state *cso)
261 {
262 if (!cso->compare_mode)
263 return MALI_FUNC_NEVER;
264
265 enum mali_func f = panfrost_translate_compare_func(cso->compare_func);
266 return panfrost_flip_compare_func(f);
267 }
268
269 static enum mali_mipmap_mode
270 pan_pipe_to_mipmode(enum pipe_tex_mipfilter f)
271 {
272 switch (f) {
273 case PIPE_TEX_MIPFILTER_NEAREST: return MALI_MIPMAP_MODE_NEAREST;
274 case PIPE_TEX_MIPFILTER_LINEAR: return MALI_MIPMAP_MODE_TRILINEAR;
275 case PIPE_TEX_MIPFILTER_NONE: return MALI_MIPMAP_MODE_NONE;
276 default: unreachable("Invalid");
277 }
278 }
279
280 void panfrost_sampler_desc_init(const struct pipe_sampler_state *cso,
281 struct mali_midgard_sampler_packed *hw)
282 {
283 pan_pack(hw, MIDGARD_SAMPLER, cfg) {
284 cfg.magnify_nearest = cso->mag_img_filter == PIPE_TEX_FILTER_NEAREST;
285 cfg.minify_nearest = cso->min_img_filter == PIPE_TEX_FILTER_NEAREST;
286 cfg.mipmap_mode = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR) ?
287 MALI_MIPMAP_MODE_TRILINEAR : MALI_MIPMAP_MODE_NEAREST;
288 cfg.normalized_coordinates = cso->normalized_coords;
289
290 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
291
292 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
293
294 /* If necessary, we disable mipmapping in the sampler descriptor by
295 * clamping the LOD range as tight as possible (minimum to minimum plus
296 * epsilon, essentially -- remember these are fixed-point numbers, so
297 * epsilon = 1/256) */
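/* (Illustrative: given the 1/256 step, min_lod = 0 yields a clamp of
 * [0, 1/256], so effectively only the base level is ever sampled.) */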
298
299 cfg.maximum_lod = (cso->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) ?
300 cfg.minimum_lod + 1 :
301 FIXED_16(cso->max_lod, false);
302
303 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
304 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
305 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
306
307 cfg.compare_function = panfrost_sampler_compare_func(cso);
308 cfg.seamless_cube_map = cso->seamless_cube_map;
309
310 cfg.border_color_r = cso->border_color.f[0];
311 cfg.border_color_g = cso->border_color.f[1];
312 cfg.border_color_b = cso->border_color.f[2];
313 cfg.border_color_a = cso->border_color.f[3];
314 }
315 }
316
317 void panfrost_sampler_desc_init_bifrost(const struct pipe_sampler_state *cso,
318 struct mali_bifrost_sampler_packed *hw)
319 {
320 pan_pack(hw, BIFROST_SAMPLER, cfg) {
321 cfg.magnify_linear = cso->mag_img_filter == PIPE_TEX_FILTER_LINEAR;
322 cfg.minify_linear = cso->min_img_filter == PIPE_TEX_FILTER_LINEAR;
323 cfg.mipmap_mode = pan_pipe_to_mipmode(cso->min_mip_filter);
324 cfg.normalized_coordinates = cso->normalized_coords;
325
326 cfg.lod_bias = FIXED_16(cso->lod_bias, true);
327 cfg.minimum_lod = FIXED_16(cso->min_lod, false);
328 cfg.maximum_lod = FIXED_16(cso->max_lod, false);
329
330 cfg.wrap_mode_s = translate_tex_wrap(cso->wrap_s);
331 cfg.wrap_mode_t = translate_tex_wrap(cso->wrap_t);
332 cfg.wrap_mode_r = translate_tex_wrap(cso->wrap_r);
333
334 cfg.compare_function = panfrost_sampler_compare_func(cso);
335 cfg.seamless_cube_map = cso->seamless_cube_map;
336 }
337 }
338
339 static bool
340 panfrost_fs_required(
341 struct panfrost_shader_state *fs,
342 struct panfrost_blend_final *blend,
343 unsigned rt_count)
344 {
345 /* If we generally have side effects */
346 if (fs->fs_sidefx)
347 return true;
348
349 /* If colour is written we need to execute */
350 for (unsigned i = 0; i < rt_count; ++i) {
351 if (!blend[i].no_colour)
352 return true;
353 }
354
355 /* If depth is written and not implied we need to execute.
356 * TODO: Predicate on Z/S writes being enabled */
357 return (fs->writes_depth || fs->writes_stencil);
358 }
359
360 static void
361 panfrost_emit_blend(struct panfrost_batch *batch, void *rts,
362 struct panfrost_blend_final *blend)
363 {
364 const struct panfrost_device *dev = pan_device(batch->ctx->base.screen);
365 struct panfrost_shader_state *fs = panfrost_get_shader_state(batch->ctx, PIPE_SHADER_FRAGMENT);
366 unsigned rt_count = batch->key.nr_cbufs;
367
368 struct bifrost_blend_rt *brts = rts;
369
370 /* Disable blending for depth-only */
371
372 if (rt_count == 0) {
373 if (dev->quirks & IS_BIFROST) {
374 memset(brts, 0, sizeof(*brts));
375 brts[0].unk2 = 0x3;
376 } else {
377 pan_pack(rts, MIDGARD_BLEND_OPAQUE, cfg) {
378 cfg.equation = 0xf0122122; /* Replace */
379 }
380 }
381 }
382
383 for (unsigned i = 0; i < rt_count; ++i) {
384 struct mali_blend_flags_packed flags = {};
385
386 pan_pack(&flags, BLEND_FLAGS, cfg) {
387 if (blend[i].no_colour) {
388 cfg.enable = false;
389 break;
390 }
391
392 batch->draws |= (PIPE_CLEAR_COLOR0 << i);
393
394 cfg.srgb = util_format_is_srgb(batch->key.cbufs[i]->format);
395 cfg.load_destination = blend[i].load_dest;
396 cfg.dither_disable = !batch->ctx->blend->base.dither;
397
398 if (!(dev->quirks & IS_BIFROST))
399 cfg.midgard_blend_shader = blend[i].is_shader;
400 }
401
402 if (dev->quirks & IS_BIFROST) {
403 memset(brts + i, 0, sizeof(brts[i]));
404 brts[i].flags = flags.opaque[0];
405
406 if (blend[i].is_shader) {
407 /* The blend shader's address needs to be at
408 * the same top 32 bits as the fragment shader.
409 * TODO: Ensure that's always the case.
410 */
411 assert((blend[i].shader.gpu & (0xffffffffull << 32)) ==
412 (fs->bo->gpu & (0xffffffffull << 32)));
413 brts[i].shader = blend[i].shader.gpu;
414 brts[i].unk2 = 0x0;
415 } else {
416 enum pipe_format format = batch->key.cbufs[i]->format;
417 const struct util_format_description *format_desc;
418 format_desc = util_format_description(format);
419
420 brts[i].equation = blend[i].equation.equation;
421
422 /* TODO: this is a bit more complicated */
423 brts[i].constant = blend[i].equation.constant;
424
425 brts[i].format = panfrost_format_to_bifrost_blend(format_desc);
426
427 /* 0x19 disables blending and forces REPLACE
428 * mode (equivalent to rgb_mode = alpha_mode =
429 * x122, colour mask = 0xF). 0x1a allows
430 * blending. */
431 brts[i].unk2 = blend[i].opaque ? 0x19 : 0x1a;
432
433 brts[i].shader_type = fs->blend_types[i];
434 }
435 } else {
436 pan_pack(rts, MIDGARD_BLEND_OPAQUE, cfg) {
437 cfg.flags = flags;
438
439 if (blend[i].is_shader) {
440 cfg.shader = blend[i].shader.gpu | blend[i].shader.first_tag;
441 } else {
442 cfg.equation = blend[i].equation.equation.opaque[0];
443 cfg.constant = blend[i].equation.constant;
444 }
445 }
446
447 rts += MALI_MIDGARD_BLEND_LENGTH;
448 }
449 }
450 }
451
452 static void
453 panfrost_emit_frag_shader(struct panfrost_context *ctx,
454 struct mali_state_packed *fragmeta,
455 struct panfrost_blend_final *blend)
456 {
457 const struct panfrost_device *dev = pan_device(ctx->base.screen);
458 struct panfrost_shader_state *fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
459 struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
460 const struct panfrost_zsa_state *zsa = ctx->depth_stencil;
461 unsigned rt_count = ctx->pipe_framebuffer.nr_cbufs;
462 bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
463
464 /* Built up here */
465 struct mali_shader_packed shader = fs->shader;
466 struct mali_preload_packed preload = fs->preload;
467 uint32_t properties;
468 struct mali_multisample_misc_packed multisample_misc;
469 struct mali_stencil_mask_misc_packed stencil_mask_misc;
470 union midgard_blend sfbd_blend = { 0 };
471
472 if (!panfrost_fs_required(fs, blend, rt_count)) {
473 if (dev->quirks & IS_BIFROST) {
474 pan_pack(&shader, SHADER, cfg) {}
475
476 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
477 cfg.unknown = 0x950020; /* XXX */
478 cfg.early_z_enable = true;
479 }
480
481 preload.opaque[0] = 0;
482 } else {
483 pan_pack(&shader, SHADER, cfg) {
484 cfg.shader = 0x1;
485 }
486
487 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
488 cfg.work_register_count = 1;
489 cfg.depth_source = MALI_DEPTH_SOURCE_FIXED_FUNCTION;
490 cfg.early_z_enable = true;
491 }
492 }
493 } else if (dev->quirks & IS_BIFROST) {
494 bool no_blend = true;
495
496 for (unsigned i = 0; i < rt_count; ++i)
497 no_blend &= (!blend[i].load_dest | blend[i].no_colour);
498
499 pan_pack(&properties, BIFROST_PROPERTIES, cfg) {
500 cfg.early_z_enable = !fs->can_discard && !fs->writes_depth && no_blend;
501 }
502
503 /* Combine with prepacked properties */
504 properties |= fs->properties.opaque[0];
505 } else {
506 /* Reasons to disable early-Z from a shader perspective */
507 bool late_z = fs->can_discard || fs->writes_global ||
508 fs->writes_depth || fs->writes_stencil;
509
510 /* If either depth or stencil is enabled, discard matters */
511 bool zs_enabled =
512 (zsa->base.depth.enabled && zsa->base.depth.func != PIPE_FUNC_ALWAYS) ||
513 zsa->base.stencil[0].enabled;
514
515 bool has_blend_shader = false;
516
517 for (unsigned c = 0; c < rt_count; ++c)
518 has_blend_shader |= blend[c].is_shader;
519
520 pan_pack(&properties, MIDGARD_PROPERTIES, cfg) {
521 /* TODO: Reduce this limit? */
522 if (has_blend_shader)
523 cfg.work_register_count = MAX2(fs->work_reg_count, 8);
524 else
525 cfg.work_register_count = fs->work_reg_count;
526
527 cfg.early_z_enable = !(late_z || alpha_to_coverage);
528 cfg.reads_tilebuffer = fs->outputs_read || (!zs_enabled && fs->can_discard);
529 cfg.reads_depth_stencil = zs_enabled && fs->can_discard;
530 }
531
532 properties |= fs->properties.opaque[0];
533 }
534
535 pan_pack(&multisample_misc, MULTISAMPLE_MISC, cfg) {
536 bool msaa = rast->multisample;
537 cfg.multisample_enable = msaa;
538 cfg.sample_mask = (msaa ? ctx->sample_mask : ~0) & 0xFFFF;
539
540 /* EXT_shader_framebuffer_fetch requires per-sample */
541 bool per_sample = ctx->min_samples > 1 || fs->outputs_read;
542 cfg.evaluate_per_sample = msaa && per_sample;
543
544 if (dev->quirks & MIDGARD_SFBD) {
545 cfg.sfbd_load_destination = blend[0].load_dest;
546 cfg.sfbd_blend_shader = blend[0].is_shader;
547 }
548
549 cfg.depth_function = zsa->base.depth.enabled ?
550 panfrost_translate_compare_func(zsa->base.depth.func) :
551 MALI_FUNC_ALWAYS;
552
553 cfg.depth_write_mask = zsa->base.depth.writemask;
554 cfg.near_discard = rast->depth_clip_near;
555 cfg.far_discard = rast->depth_clip_far;
556 cfg.unknown_2 = true;
557 }
558
559 pan_pack(&stencil_mask_misc, STENCIL_MASK_MISC, cfg) {
560 cfg.stencil_mask_front = zsa->stencil_mask_front;
561 cfg.stencil_mask_back = zsa->stencil_mask_back;
562 cfg.stencil_enable = zsa->base.stencil[0].enabled;
563 cfg.alpha_to_coverage = alpha_to_coverage;
564
565 if (dev->quirks & MIDGARD_SFBD) {
566 cfg.sfbd_write_enable = !blend[0].no_colour;
567 cfg.sfbd_srgb = util_format_is_srgb(ctx->pipe_framebuffer.cbufs[0]->format);
568 cfg.sfbd_dither_disable = !ctx->blend->base.dither;
569 }
570
571 cfg.unknown_1 = 0x7;
572 cfg.depth_range_1 = cfg.depth_range_2 = rast->offset_tri;
573 cfg.single_sampled_lines = !rast->multisample;
574 }
575
576 if (dev->quirks & MIDGARD_SFBD) {
577 if (blend[0].is_shader) {
578 sfbd_blend.shader = blend[0].shader.gpu |
579 blend[0].shader.first_tag;
580 } else {
581 sfbd_blend.equation = blend[0].equation.equation;
582 sfbd_blend.constant = blend[0].equation.constant;
583 }
584 } else if (!(dev->quirks & IS_BIFROST)) {
585 /* Bug where MRT-capable hw apparently reads the last blend
586 * shader from here instead of the usual location? */
587
588 for (signed rt = ((signed) rt_count - 1); rt >= 0; --rt) {
589 if (!blend[rt].is_shader)
590 continue;
591
592 sfbd_blend.shader = blend[rt].shader.gpu |
593 blend[rt].shader.first_tag;
594 break;
595 }
596 }
597
598 pan_pack(fragmeta, STATE_OPAQUE, cfg) {
599 cfg.shader = fs->shader;
600 cfg.properties = properties;
601 cfg.depth_units = rast->offset_units * 2.0f;
602 cfg.depth_factor = rast->offset_scale;
603 cfg.multisample_misc = multisample_misc;
604 cfg.stencil_mask_misc = stencil_mask_misc;
605
606 cfg.stencil_front = zsa->stencil_front;
607 cfg.stencil_back = zsa->stencil_back;
608
609 /* Bottom bits for stencil ref, exactly one word */
610 bool back_enab = zsa->base.stencil[1].enabled;
611 cfg.stencil_front.opaque[0] |= ctx->stencil_ref.ref_value[0];
612 cfg.stencil_back.opaque[0] |= ctx->stencil_ref.ref_value[back_enab ? 1 : 0];
613
614 if (dev->quirks & IS_BIFROST)
615 cfg.preload = preload;
616 else
617 memcpy(&cfg.sfbd_blend, &sfbd_blend, sizeof(sfbd_blend));
618 }
619 }
620
621 mali_ptr
622 panfrost_emit_compute_shader_meta(struct panfrost_batch *batch, enum pipe_shader_type stage)
623 {
624 struct panfrost_shader_state *ss = panfrost_get_shader_state(batch->ctx, stage);
625
626 panfrost_batch_add_bo(batch, ss->bo,
627 PAN_BO_ACCESS_PRIVATE |
628 PAN_BO_ACCESS_READ |
629 PAN_BO_ACCESS_VERTEX_TILER);
630
631 panfrost_batch_add_bo(batch, pan_resource(ss->upload.rsrc)->bo,
632 PAN_BO_ACCESS_PRIVATE |
633 PAN_BO_ACCESS_READ |
634 PAN_BO_ACCESS_VERTEX_TILER);
635
636 return pan_resource(ss->upload.rsrc)->bo->gpu + ss->upload.offset;
637 }
638
639 mali_ptr
640 panfrost_emit_frag_shader_meta(struct panfrost_batch *batch)
641 {
642 struct panfrost_context *ctx = batch->ctx;
643 struct panfrost_shader_state *ss = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
644
645 /* Add the shader BO to the batch. */
646 panfrost_batch_add_bo(batch, ss->bo,
647 PAN_BO_ACCESS_PRIVATE |
648 PAN_BO_ACCESS_READ |
649 PAN_BO_ACCESS_FRAGMENT);
650
651 struct panfrost_device *dev = pan_device(ctx->base.screen);
652 unsigned rt_count = MAX2(ctx->pipe_framebuffer.nr_cbufs, 1);
653 struct panfrost_transfer xfer;
654 unsigned rt_size;
655
656 if (dev->quirks & MIDGARD_SFBD)
657 rt_size = 0;
658 else if (dev->quirks & IS_BIFROST)
659 rt_size = sizeof(struct bifrost_blend_rt);
660 else
661 rt_size = sizeof(struct midgard_blend_rt);
662
663 unsigned desc_size = MALI_STATE_LENGTH + rt_size * rt_count;
664 xfer = panfrost_pool_alloc_aligned(&batch->pool, desc_size, MALI_STATE_LENGTH);
665
666 struct panfrost_blend_final blend[PIPE_MAX_COLOR_BUFS];
667
668 for (unsigned c = 0; c < ctx->pipe_framebuffer.nr_cbufs; ++c)
669 blend[c] = panfrost_get_blend_for_context(ctx, c);
670
671 panfrost_emit_frag_shader(ctx, (struct mali_state_packed *) xfer.cpu, blend);
672
673 if (!(dev->quirks & MIDGARD_SFBD))
674 panfrost_emit_blend(batch, xfer.cpu + MALI_STATE_LENGTH, blend);
675 else
676 batch->draws |= PIPE_CLEAR_COLOR0;
677
678 return xfer.gpu;
679 }
680
681 mali_ptr
682 panfrost_emit_viewport(struct panfrost_batch *batch)
683 {
684 struct panfrost_context *ctx = batch->ctx;
685 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
686 const struct pipe_scissor_state *ss = &ctx->scissor;
687 const struct pipe_rasterizer_state *rast = &ctx->rasterizer->base;
688 const struct pipe_framebuffer_state *fb = &ctx->pipe_framebuffer;
689
690 /* Derive min/max from translate/scale. Note since |x| >= 0 by
691 * definition, we have that -|x| <= |x| hence translate - |scale| <=
692 * translate + |scale|, so the ordering is correct here. */
693 float vp_minx = (int) (vp->translate[0] - fabsf(vp->scale[0]));
694 float vp_maxx = (int) (vp->translate[0] + fabsf(vp->scale[0]));
695 float vp_miny = (int) (vp->translate[1] - fabsf(vp->scale[1]));
696 float vp_maxy = (int) (vp->translate[1] + fabsf(vp->scale[1]));
697 float minz = (vp->translate[2] - fabsf(vp->scale[2]));
698 float maxz = (vp->translate[2] + fabsf(vp->scale[2]));
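/* (E.g. an 800x600 viewport anchored at the origin has translate = (400, 300)
 * and scale = (400, -300) in x/y, yielding [0, 800] x [0, 600] regardless of
 * the sign of the scale.) */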
699
700 /* Scissor to the intersection of the viewport and the scissor, clamped
701 * to the framebuffer */
702
703 unsigned minx = MIN2(fb->width, vp_minx);
704 unsigned maxx = MIN2(fb->width, vp_maxx);
705 unsigned miny = MIN2(fb->height, vp_miny);
706 unsigned maxy = MIN2(fb->height, vp_maxy);
707
708 if (ss && rast->scissor) {
709 minx = MAX2(ss->minx, minx);
710 miny = MAX2(ss->miny, miny);
711 maxx = MIN2(ss->maxx, maxx);
712 maxy = MIN2(ss->maxy, maxy);
713 }
714
715 struct panfrost_transfer T = panfrost_pool_alloc(&batch->pool, MALI_VIEWPORT_LENGTH);
716
717 pan_pack(T.cpu, VIEWPORT, cfg) {
718 cfg.scissor_minimum_x = minx;
719 cfg.scissor_minimum_y = miny;
720 cfg.scissor_maximum_x = maxx - 1;
721 cfg.scissor_maximum_y = maxy - 1;
722
723 cfg.minimum_z = rast->depth_clip_near ? minz : -INFINITY;
724 cfg.maximum_z = rast->depth_clip_far ? maxz : INFINITY;
725 }
726
727 panfrost_batch_union_scissor(batch, minx, miny, maxx, maxy);
728 return T.gpu;
729 }
730
731 static mali_ptr
732 panfrost_map_constant_buffer_gpu(struct panfrost_batch *batch,
733 enum pipe_shader_type st,
734 struct panfrost_constant_buffer *buf,
735 unsigned index)
736 {
737 struct pipe_constant_buffer *cb = &buf->cb[index];
738 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
739
740 if (rsrc) {
741 panfrost_batch_add_bo(batch, rsrc->bo,
742 PAN_BO_ACCESS_SHARED |
743 PAN_BO_ACCESS_READ |
744 panfrost_bo_access_for_stage(st));
745
746 /* Alignment guaranteed by
747 * PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT */
748 return rsrc->bo->gpu + cb->buffer_offset;
749 } else if (cb->user_buffer) {
750 return panfrost_pool_upload_aligned(&batch->pool,
751 cb->user_buffer +
752 cb->buffer_offset,
753 cb->buffer_size, 16);
754 } else {
755 unreachable("No constant buffer");
756 }
757 }
758
759 struct sysval_uniform {
760 union {
761 float f[4];
762 int32_t i[4];
763 uint32_t u[4];
764 uint64_t du[2];
765 };
766 };
767
768 static void
769 panfrost_upload_viewport_scale_sysval(struct panfrost_batch *batch,
770 struct sysval_uniform *uniform)
771 {
772 struct panfrost_context *ctx = batch->ctx;
773 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
774
775 uniform->f[0] = vp->scale[0];
776 uniform->f[1] = vp->scale[1];
777 uniform->f[2] = vp->scale[2];
778 }
779
780 static void
781 panfrost_upload_viewport_offset_sysval(struct panfrost_batch *batch,
782 struct sysval_uniform *uniform)
783 {
784 struct panfrost_context *ctx = batch->ctx;
785 const struct pipe_viewport_state *vp = &ctx->pipe_viewport;
786
787 uniform->f[0] = vp->translate[0];
788 uniform->f[1] = vp->translate[1];
789 uniform->f[2] = vp->translate[2];
790 }
791
792 static void panfrost_upload_txs_sysval(struct panfrost_batch *batch,
793 enum pipe_shader_type st,
794 unsigned int sysvalid,
795 struct sysval_uniform *uniform)
796 {
797 struct panfrost_context *ctx = batch->ctx;
798 unsigned texidx = PAN_SYSVAL_ID_TO_TXS_TEX_IDX(sysvalid);
799 unsigned dim = PAN_SYSVAL_ID_TO_TXS_DIM(sysvalid);
800 bool is_array = PAN_SYSVAL_ID_TO_TXS_IS_ARRAY(sysvalid);
801 struct pipe_sampler_view *tex = &ctx->sampler_views[st][texidx]->base;
802
803 assert(dim);
804 uniform->i[0] = u_minify(tex->texture->width0, tex->u.tex.first_level);
805
806 if (dim > 1)
807 uniform->i[1] = u_minify(tex->texture->height0,
808 tex->u.tex.first_level);
809
810 if (dim > 2)
811 uniform->i[2] = u_minify(tex->texture->depth0,
812 tex->u.tex.first_level);
813
814 if (is_array)
815 uniform->i[dim] = tex->texture->array_size;
816 }
817
818 static void
819 panfrost_upload_ssbo_sysval(struct panfrost_batch *batch,
820 enum pipe_shader_type st,
821 unsigned ssbo_id,
822 struct sysval_uniform *uniform)
823 {
824 struct panfrost_context *ctx = batch->ctx;
825
826 assert(ctx->ssbo_mask[st] & (1 << ssbo_id));
827 struct pipe_shader_buffer sb = ctx->ssbo[st][ssbo_id];
828
829 /* Compute address */
830 struct panfrost_bo *bo = pan_resource(sb.buffer)->bo;
831
832 panfrost_batch_add_bo(batch, bo,
833 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_RW |
834 panfrost_bo_access_for_stage(st));
835
836 /* Upload address and size as sysval */
837 uniform->du[0] = bo->gpu + sb.buffer_offset;
838 uniform->u[2] = sb.buffer_size;
839 }
840
841 static void
842 panfrost_upload_sampler_sysval(struct panfrost_batch *batch,
843 enum pipe_shader_type st,
844 unsigned samp_idx,
845 struct sysval_uniform *uniform)
846 {
847 struct panfrost_context *ctx = batch->ctx;
848 struct pipe_sampler_state *sampl = &ctx->samplers[st][samp_idx]->base;
849
850 uniform->f[0] = sampl->min_lod;
851 uniform->f[1] = sampl->max_lod;
852 uniform->f[2] = sampl->lod_bias;
853
854 /* Even without any errata, Midgard represents "no mipmapping" as
855 * fixing the LOD with the clamps; keep behaviour consistent. c.f.
856 * panfrost_create_sampler_state which also explains our choice of
857 * epsilon value (again to keep behaviour consistent) */
858
859 if (sampl->min_mip_filter == PIPE_TEX_MIPFILTER_NONE)
860 uniform->f[1] = uniform->f[0] + (1.0/256.0);
861 }
862
863 static void
864 panfrost_upload_num_work_groups_sysval(struct panfrost_batch *batch,
865 struct sysval_uniform *uniform)
866 {
867 struct panfrost_context *ctx = batch->ctx;
868
869 uniform->u[0] = ctx->compute_grid->grid[0];
870 uniform->u[1] = ctx->compute_grid->grid[1];
871 uniform->u[2] = ctx->compute_grid->grid[2];
872 }
873
874 static void
875 panfrost_upload_sysvals(struct panfrost_batch *batch, void *buf,
876 struct panfrost_shader_state *ss,
877 enum pipe_shader_type st)
878 {
879 struct sysval_uniform *uniforms = (void *)buf;
880
881 for (unsigned i = 0; i < ss->sysval_count; ++i) {
882 int sysval = ss->sysval[i];
883
884 switch (PAN_SYSVAL_TYPE(sysval)) {
885 case PAN_SYSVAL_VIEWPORT_SCALE:
886 panfrost_upload_viewport_scale_sysval(batch,
887 &uniforms[i]);
888 break;
889 case PAN_SYSVAL_VIEWPORT_OFFSET:
890 panfrost_upload_viewport_offset_sysval(batch,
891 &uniforms[i]);
892 break;
893 case PAN_SYSVAL_TEXTURE_SIZE:
894 panfrost_upload_txs_sysval(batch, st,
895 PAN_SYSVAL_ID(sysval),
896 &uniforms[i]);
897 break;
898 case PAN_SYSVAL_SSBO:
899 panfrost_upload_ssbo_sysval(batch, st,
900 PAN_SYSVAL_ID(sysval),
901 &uniforms[i]);
902 break;
903 case PAN_SYSVAL_NUM_WORK_GROUPS:
904 panfrost_upload_num_work_groups_sysval(batch,
905 &uniforms[i]);
906 break;
907 case PAN_SYSVAL_SAMPLER:
908 panfrost_upload_sampler_sysval(batch, st,
909 PAN_SYSVAL_ID(sysval),
910 &uniforms[i]);
911 break;
912 default:
913 assert(0);
914 }
915 }
916 }
917
918 static const void *
919 panfrost_map_constant_buffer_cpu(struct panfrost_constant_buffer *buf,
920 unsigned index)
921 {
922 struct pipe_constant_buffer *cb = &buf->cb[index];
923 struct panfrost_resource *rsrc = pan_resource(cb->buffer);
924
925 if (rsrc)
926 return rsrc->bo->cpu;
927 else if (cb->user_buffer)
928 return cb->user_buffer;
929 else
930 unreachable("No constant buffer");
931 }
932
933 mali_ptr
934 panfrost_emit_const_buf(struct panfrost_batch *batch,
935 enum pipe_shader_type stage,
936 mali_ptr *push_constants)
937 {
938 struct panfrost_context *ctx = batch->ctx;
939 struct panfrost_shader_variants *all = ctx->shader[stage];
940
941 if (!all)
942 return 0;
943
944 struct panfrost_constant_buffer *buf = &ctx->constant_buffer[stage];
945
946 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
947
948 /* Uniforms are implicitly UBO #0 */
949 bool has_uniforms = buf->enabled_mask & (1 << 0);
950
951 /* Allocate room for the sysvals and the uniforms */
952 size_t sys_size = sizeof(float) * 4 * ss->sysval_count;
953 size_t uniform_size = has_uniforms ? (buf->cb[0].buffer_size) : 0;
954 size_t size = sys_size + uniform_size;
955 struct panfrost_transfer transfer =
956 panfrost_pool_alloc_aligned(&batch->pool, size, 16);
957
958 /* Upload sysvals requested by the shader */
959 panfrost_upload_sysvals(batch, transfer.cpu, ss, stage);
960
961 /* Upload uniforms */
962 if (has_uniforms && uniform_size) {
963 const void *cpu = panfrost_map_constant_buffer_cpu(buf, 0);
964 memcpy(transfer.cpu + sys_size, cpu, uniform_size);
965 }
966
967 /* Next up, attach UBOs. UBO #0 is the uniforms we just
968 * uploaded, so it's always included. The count is the highest UBO
969 * addressable -- gaps are included. */
970
971 unsigned ubo_count = 32 - __builtin_clz(buf->enabled_mask | 1);
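/* (E.g. enabled_mask = 0b1001 -- UBOs 0 and 3 -- gives ubo_count = 4; the
 * disabled slots in between still get zeroed entries below.) */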
972
973 size_t sz = MALI_UNIFORM_BUFFER_LENGTH * ubo_count;
974 struct panfrost_transfer ubos =
975 panfrost_pool_alloc_aligned(&batch->pool, sz,
976 MALI_UNIFORM_BUFFER_LENGTH);
977
978 uint64_t *ubo_ptr = (uint64_t *) ubos.cpu;
979
980 /* Upload uniforms as a UBO */
981
982 if (size) {
983 pan_pack(ubo_ptr, UNIFORM_BUFFER, cfg) {
984 cfg.entries = DIV_ROUND_UP(size, 16);
985 cfg.pointer = transfer.gpu;
986 }
987 } else {
988 *ubo_ptr = 0;
989 }
990
991 /* The rest are honest-to-goodness UBOs */
992
993 for (unsigned ubo = 1; ubo < ubo_count; ++ubo) {
994 size_t usz = buf->cb[ubo].buffer_size;
995 bool enabled = buf->enabled_mask & (1 << ubo);
996 bool empty = usz == 0;
997
998 if (!enabled || empty) {
999 ubo_ptr[ubo] = 0;
1000 continue;
1001 }
1002
1003 pan_pack(ubo_ptr + ubo, UNIFORM_BUFFER, cfg) {
1004 cfg.entries = DIV_ROUND_UP(usz, 16);
1005 cfg.pointer = panfrost_map_constant_buffer_gpu(batch,
1006 stage, buf, ubo);
1007 }
1008 }
1009
1010 *push_constants = transfer.gpu;
1011
1012 buf->dirty_mask = 0;
1013 return ubos.gpu;
1014 }
1015
1016 mali_ptr
1017 panfrost_emit_shared_memory(struct panfrost_batch *batch,
1018 const struct pipe_grid_info *info)
1019 {
1020 struct panfrost_context *ctx = batch->ctx;
1021 struct panfrost_device *dev = pan_device(ctx->base.screen);
1022 struct panfrost_shader_variants *all = ctx->shader[PIPE_SHADER_COMPUTE];
1023 struct panfrost_shader_state *ss = &all->variants[all->active_variant];
1024 unsigned single_size = util_next_power_of_two(MAX2(ss->shared_size,
1025 128));
1026
1027 unsigned log2_instances =
1028 util_logbase2_ceil(info->grid[0]) +
1029 util_logbase2_ceil(info->grid[1]) +
1030 util_logbase2_ceil(info->grid[2]);
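/* (E.g. a (3, 5, 1) grid gives log2_instances = 2 + 3 + 0 = 5, i.e. shared
 * storage for 32 workgroups per core, with each grid dimension rounded up to
 * a power of two.) */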
1031
1032 unsigned shared_size = single_size * (1 << log2_instances) * dev->core_count;
1033 struct panfrost_bo *bo = panfrost_batch_get_shared_memory(batch,
1034 shared_size,
1035 1);
1036
1037 struct mali_shared_memory shared = {
1038 .shared_memory = bo->gpu,
1039 .shared_workgroup_count = log2_instances,
1040 .shared_shift = util_logbase2(single_size) + 1
1041 };
1042
1043 return panfrost_pool_upload_aligned(&batch->pool, &shared,
1044 sizeof(shared), 64);
1045 }
1046
1047 static mali_ptr
1048 panfrost_get_tex_desc(struct panfrost_batch *batch,
1049 enum pipe_shader_type st,
1050 struct panfrost_sampler_view *view)
1051 {
1052 if (!view)
1053 return (mali_ptr) 0;
1054
1055 struct pipe_sampler_view *pview = &view->base;
1056 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1057
1058 /* Add the BO to the job so it's retained until the job is done. */
1059
1060 panfrost_batch_add_bo(batch, rsrc->bo,
1061 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1062 panfrost_bo_access_for_stage(st));
1063
1064 panfrost_batch_add_bo(batch, view->bo,
1065 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1066 panfrost_bo_access_for_stage(st));
1067
1068 return view->bo->gpu;
1069 }
1070
1071 static void
1072 panfrost_update_sampler_view(struct panfrost_sampler_view *view,
1073 struct pipe_context *pctx)
1074 {
1075 struct panfrost_resource *rsrc = pan_resource(view->base.texture);
1076 if (view->texture_bo != rsrc->bo->gpu ||
1077 view->modifier != rsrc->modifier) {
1078 panfrost_bo_unreference(view->bo);
1079 panfrost_create_sampler_view_bo(view, pctx, &rsrc->base);
1080 }
1081 }
1082
1083 mali_ptr
1084 panfrost_emit_texture_descriptors(struct panfrost_batch *batch,
1085 enum pipe_shader_type stage)
1086 {
1087 struct panfrost_context *ctx = batch->ctx;
1088 struct panfrost_device *device = pan_device(ctx->base.screen);
1089
1090 if (!ctx->sampler_view_count[stage])
1091 return 0;
1092
1093 if (device->quirks & IS_BIFROST) {
1094 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1095 MALI_BIFROST_TEXTURE_LENGTH *
1096 ctx->sampler_view_count[stage],
1097 MALI_BIFROST_TEXTURE_LENGTH);
1098
1099 struct mali_bifrost_texture_packed *out =
1100 (struct mali_bifrost_texture_packed *) T.cpu;
1101
1102 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1103 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1104 struct pipe_sampler_view *pview = &view->base;
1105 struct panfrost_resource *rsrc = pan_resource(pview->texture);
1106
1107 panfrost_update_sampler_view(view, &ctx->base);
1108 out[i] = view->bifrost_descriptor;
1109
1110 /* Add the BOs to the job so they are retained until the job is done. */
1111
1112 panfrost_batch_add_bo(batch, rsrc->bo,
1113 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1114 panfrost_bo_access_for_stage(stage));
1115
1116 panfrost_batch_add_bo(batch, view->bo,
1117 PAN_BO_ACCESS_SHARED | PAN_BO_ACCESS_READ |
1118 panfrost_bo_access_for_stage(stage));
1119 }
1120
1121 return T.gpu;
1122 } else {
1123 uint64_t trampolines[PIPE_MAX_SHADER_SAMPLER_VIEWS];
1124
1125 for (int i = 0; i < ctx->sampler_view_count[stage]; ++i) {
1126 struct panfrost_sampler_view *view = ctx->sampler_views[stage][i];
1127
1128 panfrost_update_sampler_view(view, &ctx->base);
1129
1130 trampolines[i] = panfrost_get_tex_desc(batch, stage, view);
1131 }
1132
1133 return panfrost_pool_upload_aligned(&batch->pool, trampolines,
1134 sizeof(uint64_t) *
1135 ctx->sampler_view_count[stage],
1136 sizeof(uint64_t));
1137 }
1138 }
1139
1140 mali_ptr
1141 panfrost_emit_sampler_descriptors(struct panfrost_batch *batch,
1142 enum pipe_shader_type stage)
1143 {
1144 struct panfrost_context *ctx = batch->ctx;
1145
1146 if (!ctx->sampler_count[stage])
1147 return 0;
1148
1149 size_t desc_size = MALI_BIFROST_SAMPLER_LENGTH;
1150 assert(MALI_BIFROST_SAMPLER_LENGTH == MALI_MIDGARD_SAMPLER_LENGTH);
1151
1152 size_t sz = desc_size * ctx->sampler_count[stage];
1153 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool, sz, desc_size);
1154 struct mali_midgard_sampler_packed *out = (struct mali_midgard_sampler_packed *) T.cpu;
1155
1156 for (unsigned i = 0; i < ctx->sampler_count[stage]; ++i)
1157 out[i] = ctx->samplers[stage][i]->hw;
1158
1159 return T.gpu;
1160 }
1161
1162 mali_ptr
1163 panfrost_emit_vertex_data(struct panfrost_batch *batch,
1164 mali_ptr *buffers)
1165 {
1166 struct panfrost_context *ctx = batch->ctx;
1167 struct panfrost_vertex_state *so = ctx->vertex;
1168 struct panfrost_shader_state *vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1169
1170 /* Worst case: everything is NPOT, which is only possible if instancing
1171 * is enabled. Otherwise a single record is guaranteed */
1172 struct panfrost_transfer S = panfrost_pool_alloc_aligned(&batch->pool,
1173 MALI_ATTRIBUTE_BUFFER_LENGTH * vs->attribute_count *
1174 (ctx->instance_count > 1 ? 2 : 1),
1175 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1176
1177 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1178 MALI_ATTRIBUTE_LENGTH * vs->attribute_count,
1179 MALI_ATTRIBUTE_LENGTH);
1180
1181 struct mali_attribute_buffer_packed *bufs =
1182 (struct mali_attribute_buffer_packed *) S.cpu;
1183
1184 struct mali_attribute_packed *out =
1185 (struct mali_attribute_packed *) T.cpu;
1186
1187 unsigned attrib_to_buffer[PIPE_MAX_ATTRIBS] = { 0 };
1188 unsigned k = 0;
1189
1190 for (unsigned i = 0; i < so->num_elements; ++i) {
1191 /* We map buffers 1:1 with the attributes, which
1192 * means duplicating some vertex buffers (who cares? aside from
1193 * maybe some caching implications but I somehow doubt that
1194 * matters) */
1195
1196 struct pipe_vertex_element *elem = &so->pipe[i];
1197 unsigned vbi = elem->vertex_buffer_index;
1198 attrib_to_buffer[i] = k;
1199
1200 if (!(ctx->vb_mask & (1 << vbi)))
1201 continue;
1202
1203 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1204 struct panfrost_resource *rsrc;
1205
1206 rsrc = pan_resource(buf->buffer.resource);
1207 if (!rsrc)
1208 continue;
1209
1210 /* Add a dependency of the batch on the vertex buffer */
1211 panfrost_batch_add_bo(batch, rsrc->bo,
1212 PAN_BO_ACCESS_SHARED |
1213 PAN_BO_ACCESS_READ |
1214 PAN_BO_ACCESS_VERTEX_TILER);
1215
1216 /* Mask off lower bits, see offset fixup below */
1217 mali_ptr raw_addr = rsrc->bo->gpu + buf->buffer_offset;
1218 mali_ptr addr = raw_addr & ~63;
1219
1220 /* Since we advanced the base pointer, we shrink the buffer
1221 * size, but add the offset we subtracted */
1222 unsigned size = rsrc->base.width0 + (raw_addr - addr)
1223 - buf->buffer_offset;
1224
1225 /* When there is a divisor, the hardware-level divisor is
1226 * the product of the instance divisor and the padded count */
1227 unsigned divisor = elem->instance_divisor;
1228 unsigned hw_divisor = ctx->padded_count * divisor;
1229 unsigned stride = buf->stride;
1230
1231 /* If there's a divisor (even = 1) but no instancing, we want every
1232 * fetch to see the same attribute, so zero the stride */
1233
1234 if (divisor && ctx->instance_count == 1)
1235 stride = 0;
1236
1237 if (!divisor || ctx->instance_count <= 1) {
1238 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1239 if (ctx->instance_count > 1) {
1240 cfg.type = MALI_ATTRIBUTE_TYPE_1D_MODULUS;
1241 cfg.divisor = ctx->padded_count;
1242 }
1243
1244 cfg.pointer = addr;
1245 cfg.stride = stride;
1246 cfg.size = size;
1247 }
1248 } else if (util_is_power_of_two_or_zero(hw_divisor)) {
1249 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1250 cfg.type = MALI_ATTRIBUTE_TYPE_1D_POT_DIVISOR;
1251 cfg.pointer = addr;
1252 cfg.stride = stride;
1253 cfg.size = size;
1254 cfg.divisor_r = __builtin_ctz(hw_divisor);
1255 }
1256
1257 } else {
1258 unsigned shift = 0, extra_flags = 0;
1259
1260 unsigned magic_divisor =
1261 panfrost_compute_magic_divisor(hw_divisor, &shift, &extra_flags);
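/* (Presumably the usual "magic number" reciprocal trick for dividing by a
 * runtime constant: a fixed-point multiplier plus a shift and a rounding
 * flag that together implement division by hw_divisor.) */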
1262
1263 pan_pack(bufs + k, ATTRIBUTE_BUFFER, cfg) {
1264 cfg.type = MALI_ATTRIBUTE_TYPE_1D_NPOT_DIVISOR;
1265 cfg.pointer = addr;
1266 cfg.stride = stride;
1267 cfg.size = size;
1268
1269 cfg.divisor_r = shift;
1270 cfg.divisor_e = extra_flags;
1271 }
1272
1273 pan_pack(bufs + k + 1, ATTRIBUTE_BUFFER_CONTINUATION_NPOT, cfg) {
1274 cfg.divisor_numerator = magic_divisor;
1275 cfg.divisor = divisor;
1276 }
1277
1278 ++k;
1279 }
1280
1281 ++k;
1282 }
1283
1284 /* Add special gl_VertexID/gl_InstanceID buffers */
1285
1286 if (unlikely(vs->attribute_count >= PAN_VERTEX_ID)) {
1287 panfrost_vertex_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1288
1289 pan_pack(out + PAN_VERTEX_ID, ATTRIBUTE, cfg) {
1290 cfg.buffer_index = k++;
1291 cfg.format = so->formats[PAN_VERTEX_ID];
1292 }
1293
1294 panfrost_instance_id(ctx->padded_count, &bufs[k], ctx->instance_count > 1);
1295
1296 pan_pack(out + PAN_INSTANCE_ID, ATTRIBUTE, cfg) {
1297 cfg.buffer_index = k++;
1298 cfg.format = so->formats[PAN_INSTANCE_ID];
1299 }
1300 }
1301
1302 /* Attribute addresses require 64-byte alignment, so let:
1303 *
1304 * base' = base & ~63 = base - (base & 63)
1305 * offset' = offset + (base & 63)
1306 *
1307 * Since base' + offset' = base + offset, these are equivalent
1308 * addressing modes and now base is 64 aligned.
1309 */
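/* (E.g. base = 0x10007 becomes base' = 0x10000 and offset' = offset + 7.) */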
1310
1311 for (unsigned i = 0; i < so->num_elements; ++i) {
1312 unsigned vbi = so->pipe[i].vertex_buffer_index;
1313 struct pipe_vertex_buffer *buf = &ctx->vertex_buffers[vbi];
1314
1315 /* Adjust by the masked off bits of the offset. Make sure we
1316 * read src_offset from so->hw (which is not GPU visible)
1317 * rather than target (which is) due to caching effects */
1318
1319 unsigned src_offset = so->pipe[i].src_offset;
1320
1321 /* BOs aligned to 4k so guaranteed aligned to 64 */
1322 src_offset += (buf->buffer_offset & 63);
1323
1324 /* Also, somewhat obscurely, per-instance data needs to be
1325 * offset in response to a delayed start in an indexed draw */
1326
1327 if (so->pipe[i].instance_divisor && ctx->instance_count > 1)
1328 src_offset -= buf->stride * ctx->offset_start;
1329
1330 pan_pack(out + i, ATTRIBUTE, cfg) {
1331 cfg.buffer_index = attrib_to_buffer[i];
1332 cfg.format = so->formats[i];
1333 cfg.offset = src_offset;
1334 }
1335 }
1336
1337 *buffers = S.gpu;
1338 return T.gpu;
1339 }
1340
1341 static mali_ptr
1342 panfrost_emit_varyings(struct panfrost_batch *batch,
1343 struct mali_attribute_buffer_packed *slot,
1344 unsigned stride, unsigned count)
1345 {
1346 unsigned size = stride * count;
1347 mali_ptr ptr = panfrost_pool_alloc_aligned(&batch->invisible_pool, size, 64).gpu;
1348
1349 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1350 cfg.stride = stride;
1351 cfg.size = size;
1352 cfg.pointer = ptr;
1353 }
1354
1355 return ptr;
1356 }
1357
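/* Attribute buffer pointers must be 64-byte aligned, so the streamout pointer
 * is rounded down when emitted (see panfrost_emit_streamout) and the discarded
 * low bits are returned here to be folded into each captured varying's offset. */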
1358 static unsigned
1359 panfrost_streamout_offset(unsigned stride, unsigned offset,
1360 struct pipe_stream_output_target *target)
1361 {
1362 return (target->buffer_offset + (offset * stride * 4)) & 63;
1363 }
1364
1365 static void
1366 panfrost_emit_streamout(struct panfrost_batch *batch,
1367 struct mali_attribute_buffer_packed *slot,
1368 unsigned stride_words, unsigned offset, unsigned count,
1369 struct pipe_stream_output_target *target)
1370 {
1371 unsigned stride = stride_words * 4;
1372 unsigned max_size = target->buffer_size;
1373 unsigned expected_size = stride * count;
1374
1375 /* Grab the BO and bind it to the batch */
1376 struct panfrost_bo *bo = pan_resource(target->buffer)->bo;
1377
1378 /* Varyings are WRITE from the perspective of the VERTEX but READ from
1379 * the perspective of the TILER and FRAGMENT.
1380 */
1381 panfrost_batch_add_bo(batch, bo,
1382 PAN_BO_ACCESS_SHARED |
1383 PAN_BO_ACCESS_RW |
1384 PAN_BO_ACCESS_VERTEX_TILER |
1385 PAN_BO_ACCESS_FRAGMENT);
1386
1387 /* We will have an offset applied to get alignment */
1388 mali_ptr addr = bo->gpu + target->buffer_offset + (offset * stride);
1389
1390 pan_pack(slot, ATTRIBUTE_BUFFER, cfg) {
1391 cfg.pointer = (addr & ~63);
1392 cfg.stride = stride;
1393 cfg.size = MIN2(max_size, expected_size) + (addr & 63);
1394 }
1395 }
1396
1397 static bool
1398 has_point_coord(unsigned mask, gl_varying_slot loc)
1399 {
1400 if ((loc >= VARYING_SLOT_TEX0) && (loc <= VARYING_SLOT_TEX7))
1401 return (mask & (1 << (loc - VARYING_SLOT_TEX0)));
1402 else if (loc == VARYING_SLOT_PNTC)
1403 return (mask & (1 << 8));
1404 else
1405 return false;
1406 }
1407
1408 /* Helpers for manipulating stream out information so we can pack varyings
1409 * accordingly. Compute the src_offset for a given captured varying */
1410
1411 static struct pipe_stream_output *
1412 pan_get_so(struct pipe_stream_output_info *info, gl_varying_slot loc)
1413 {
1414 for (unsigned i = 0; i < info->num_outputs; ++i) {
1415 if (info->output[i].register_index == loc)
1416 return &info->output[i];
1417 }
1418
1419 unreachable("Varying not captured");
1420 }
1421
1422 static unsigned
1423 pan_varying_size(enum mali_format fmt)
1424 {
1425 unsigned type = MALI_EXTRACT_TYPE(fmt);
1426 unsigned chan = MALI_EXTRACT_CHANNELS(fmt);
1427 unsigned bits = MALI_EXTRACT_BITS(fmt);
1428 unsigned bpc = 0;
1429
1430 if (bits == MALI_CHANNEL_FLOAT) {
1431 /* No doubles */
1432 bool fp16 = (type == MALI_FORMAT_SINT);
1433 assert(fp16 || (type == MALI_FORMAT_UNORM));
1434
1435 bpc = fp16 ? 2 : 4;
1436 } else {
1437 assert(type >= MALI_FORMAT_SNORM && type <= MALI_FORMAT_SINT);
1438
1439 /* See the enums */
1440 bits = 1 << bits;
1441 assert(bits >= 8);
1442 bpc = bits / 8;
1443 }
1444
1445 return bpc * chan;
1446 }
1447
1448 /* Indices for named (non-XFB) varyings that are present. These are packed
1449 * tightly so they correspond to a bitfield present (P) indexed by (1 <<
1450 * PAN_VARY_*). This has the nice property that you can look up the buffer index
1451 * of a given special field given a shift S by:
1452 *
1453 * idx = popcount(P & ((1 << S) - 1))
1454 *
1455 * That is... look at all of the varyings that come earlier and count them; that
1456 * count is the index. Likewise, the total number of special
1457 * buffers required is simply popcount(P)
1458 */
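/* (E.g. with general, position and point size present, P = 0b111, so the
 * point-size buffer index is popcount(0b111 & 0b011) = 2 and XFB buffers
 * start at popcount(0b111) = 3.) */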
1459
1460 enum pan_special_varying {
1461 PAN_VARY_GENERAL = 0,
1462 PAN_VARY_POSITION = 1,
1463 PAN_VARY_PSIZ = 2,
1464 PAN_VARY_PNTCOORD = 3,
1465 PAN_VARY_FACE = 4,
1466 PAN_VARY_FRAGCOORD = 5,
1467
1468 /* Keep last */
1469 PAN_VARY_MAX,
1470 };
1471
1472 /* Given a varying, figure out which index it corresponds to */
1473
1474 static inline unsigned
1475 pan_varying_index(unsigned present, enum pan_special_varying v)
1476 {
1477 unsigned mask = (1 << v) - 1;
1478 return util_bitcount(present & mask);
1479 }
1480
1481 /* Get the base offset for XFB buffers, which by convention come after
1482 * everything else. Wrapper function for semantic reasons; by construction this
1483 * is just popcount. */
1484
1485 static inline unsigned
1486 pan_xfb_base(unsigned present)
1487 {
1488 return util_bitcount(present);
1489 }
1490
1491 /* Computes the present mask for varyings so we can start emitting varying records */
1492
1493 static inline unsigned
1494 pan_varying_present(
1495 struct panfrost_shader_state *vs,
1496 struct panfrost_shader_state *fs,
1497 unsigned quirks)
1498 {
1499 /* At the moment we always emit general and position buffers. Not
1500 * strictly necessary but usually harmless */
1501
1502 unsigned present = (1 << PAN_VARY_GENERAL) | (1 << PAN_VARY_POSITION);
1503
1504 /* Enable special buffers by the shader info */
1505
1506 if (vs->writes_point_size)
1507 present |= (1 << PAN_VARY_PSIZ);
1508
1509 if (fs->reads_point_coord)
1510 present |= (1 << PAN_VARY_PNTCOORD);
1511
1512 if (fs->reads_face)
1513 present |= (1 << PAN_VARY_FACE);
1514
1515 if (fs->reads_frag_coord && !(quirks & IS_BIFROST))
1516 present |= (1 << PAN_VARY_FRAGCOORD);
1517
1518 /* Also, if we have a point sprite, we need a point coord buffer */
1519
1520 for (unsigned i = 0; i < fs->varying_count; i++) {
1521 gl_varying_slot loc = fs->varyings_loc[i];
1522
1523 if (has_point_coord(fs->point_sprite_mask, loc))
1524 present |= (1 << PAN_VARY_PNTCOORD);
1525 }
1526
1527 return present;
1528 }
1529
1530 /* Emitters for varying records */
1531
1532 static void
1533 pan_emit_vary(struct mali_attribute_packed *out,
1534 unsigned present, enum pan_special_varying buf,
1535 unsigned quirks, enum mali_format format,
1536 unsigned offset)
1537 {
1538 unsigned nr_channels = MALI_EXTRACT_CHANNELS(format);
1539 unsigned swizzle = quirks & HAS_SWIZZLES ?
1540 panfrost_get_default_swizzle(nr_channels) :
1541 panfrost_bifrost_swizzle(nr_channels);
1542
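/* The low 12 bits of the packed format word carry the swizzle; Midgard and
 * Bifrost encode it differently, hence the quirk check above. */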
1543 pan_pack(out, ATTRIBUTE, cfg) {
1544 cfg.buffer_index = pan_varying_index(present, buf);
1545 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1546 cfg.format = (format << 12) | swizzle;
1547 cfg.offset = offset;
1548 }
1549 }
1550
1551 /* General varying that is unused */
1552
1553 static void
1554 pan_emit_vary_only(struct mali_attribute_packed *out,
1555 unsigned present, unsigned quirks)
1556 {
1557 pan_emit_vary(out, present, 0, quirks, MALI_VARYING_DISCARD, 0);
1558 }
1559
1560 /* Special records */
1561
1562 static const enum mali_format pan_varying_formats[PAN_VARY_MAX] = {
1563 [PAN_VARY_POSITION] = MALI_VARYING_POS,
1564 [PAN_VARY_PSIZ] = MALI_R16F,
1565 [PAN_VARY_PNTCOORD] = MALI_R16F,
1566 [PAN_VARY_FACE] = MALI_R32I,
1567 [PAN_VARY_FRAGCOORD] = MALI_RGBA32F
1568 };
1569
1570 static void
1571 pan_emit_vary_special(struct mali_attribute_packed *out,
1572 unsigned present, enum pan_special_varying buf,
1573 unsigned quirks)
1574 {
1575 assert(buf < PAN_VARY_MAX);
1576 pan_emit_vary(out, present, buf, quirks, pan_varying_formats[buf], 0);
1577 }
1578
1579 static enum mali_format
1580 pan_xfb_format(enum mali_format format, unsigned nr)
1581 {
1582 if (MALI_EXTRACT_BITS(format) == MALI_CHANNEL_FLOAT)
1583 return MALI_R32F | MALI_NR_CHANNELS(nr);
1584 else
1585 return MALI_EXTRACT_TYPE(format) | MALI_NR_CHANNELS(nr) | MALI_CHANNEL_32;
1586 }
1587
1588 /* Transform feedback records. Note struct pipe_stream_output is (if packed as
1589 * a bitfield) 32-bit, smaller than a 64-bit pointer, so may as well pass by
1590 * value. */
1591
1592 static void
1593 pan_emit_vary_xfb(struct mali_attribute_packed *out,
1594 unsigned present,
1595 unsigned max_xfb,
1596 unsigned *streamout_offsets,
1597 unsigned quirks,
1598 enum mali_format format,
1599 struct pipe_stream_output o)
1600 {
1601 unsigned swizzle = quirks & HAS_SWIZZLES ?
1602 panfrost_get_default_swizzle(o.num_components) :
1603 panfrost_bifrost_swizzle(o.num_components);
1604
1605 pan_pack(out, ATTRIBUTE, cfg) {
1606 /* XFB buffers come after everything else */
1607 cfg.buffer_index = pan_xfb_base(present) + o.output_buffer;
1608 cfg.unknown = quirks & IS_BIFROST ? 0x0 : 0x1;
1609
1610 /* Override number of channels and precision to highp */
1611 cfg.format = (pan_xfb_format(format, o.num_components) << 12) | swizzle;
1612
1613 /* Apply given offsets together */
1614 cfg.offset = (o.dst_offset * 4) /* dwords */
1615 + streamout_offsets[o.output_buffer];
1616 }
1617 }
1618
1619 /* Determine if we should capture a varying for XFB. This requires actually
1620 * having a buffer for it. If we don't capture it, we'll fall back to a general
1621 * varying path (linked or unlinked, possibly discarding the write) */
1622
1623 static bool
1624 panfrost_xfb_captured(struct panfrost_shader_state *xfb,
1625 unsigned loc, unsigned max_xfb)
1626 {
1627 if (!(xfb->so_mask & (1ll << loc)))
1628 return false;
1629
1630 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1631 return o->output_buffer < max_xfb;
1632 }
1633
1634 static void
1635 pan_emit_general_varying(struct mali_attribute_packed *out,
1636 struct panfrost_shader_state *other,
1637 struct panfrost_shader_state *xfb,
1638 gl_varying_slot loc,
1639 enum mali_format format,
1640 unsigned present,
1641 unsigned quirks,
1642 unsigned *gen_offsets,
1643 enum mali_format *gen_formats,
1644 unsigned *gen_stride,
1645 unsigned idx,
1646 bool should_alloc)
1647 {
1648 /* Check if we're linked */
1649 signed other_idx = -1;
1650
1651 for (unsigned j = 0; j < other->varying_count; ++j) {
1652 if (other->varyings_loc[j] == loc) {
1653 other_idx = j;
1654 break;
1655 }
1656 }
1657
1658 if (other_idx < 0) {
1659 pan_emit_vary_only(out, present, quirks);
1660 return;
1661 }
1662
1663 unsigned offset = gen_offsets[other_idx];
1664
1665 if (should_alloc) {
1666 /* We're linked, so allocate a space via a watermark allocation */
1667 enum mali_format alt = other->varyings[other_idx];
1668
1669 /* Do interpolation at minimum precision */
1670 unsigned size_main = pan_varying_size(format);
1671 unsigned size_alt = pan_varying_size(alt);
1672 unsigned size = MIN2(size_main, size_alt);
1673
1674 /* If a varying is marked for XFB but not actually captured, we
1675 * should match the format to the format that would otherwise
1676 * be used for XFB, since dEQP checks for invariance here. It's
1677 * unclear if this is required by the spec. */
1678
1679 if (xfb->so_mask & (1ull << loc)) {
1680 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1681 format = pan_xfb_format(format, o->num_components);
1682 size = pan_varying_size(format);
1683 } else if (size == size_alt) {
1684 format = alt;
1685 }
1686
1687 gen_offsets[idx] = *gen_stride;
1688 gen_formats[other_idx] = format;
1689 offset = *gen_stride;
1690 *gen_stride += size;
1691 }
1692
1693 pan_emit_vary(out, present, PAN_VARY_GENERAL, quirks, format, offset);
1694 }
1695
1696 /* Higher-level wrapper around all of the above, classifying a varying into one
1697 * of the above types */
1698
1699 static void
1700 panfrost_emit_varying(
1701 struct mali_attribute_packed *out,
1702 struct panfrost_shader_state *stage,
1703 struct panfrost_shader_state *other,
1704 struct panfrost_shader_state *xfb,
1705 unsigned present,
1706 unsigned max_xfb,
1707 unsigned *streamout_offsets,
1708 unsigned quirks,
1709 unsigned *gen_offsets,
1710 enum mali_format *gen_formats,
1711 unsigned *gen_stride,
1712 unsigned idx,
1713 bool should_alloc,
1714 bool is_fragment)
1715 {
1716 gl_varying_slot loc = stage->varyings_loc[idx];
1717 enum mali_format format = stage->varyings[idx];
1718
1719 /* Override format to match linkage */
1720 if (!should_alloc && gen_formats[idx])
1721 format = gen_formats[idx];
1722
1723 if (has_point_coord(stage->point_sprite_mask, loc)) {
1724 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1725 } else if (panfrost_xfb_captured(xfb, loc, max_xfb)) {
1726 struct pipe_stream_output *o = pan_get_so(&xfb->stream_output, loc);
1727 pan_emit_vary_xfb(out, present, max_xfb, streamout_offsets, quirks, format, *o);
1728 } else if (loc == VARYING_SLOT_POS) {
1729 if (is_fragment)
1730 pan_emit_vary_special(out, present, PAN_VARY_FRAGCOORD, quirks);
1731 else
1732 pan_emit_vary_special(out, present, PAN_VARY_POSITION, quirks);
1733 } else if (loc == VARYING_SLOT_PSIZ) {
1734 pan_emit_vary_special(out, present, PAN_VARY_PSIZ, quirks);
1735 } else if (loc == VARYING_SLOT_PNTC) {
1736 pan_emit_vary_special(out, present, PAN_VARY_PNTCOORD, quirks);
1737 } else if (loc == VARYING_SLOT_FACE) {
1738 pan_emit_vary_special(out, present, PAN_VARY_FACE, quirks);
1739 } else {
1740 pan_emit_general_varying(out, other, xfb, loc, format, present,
1741 quirks, gen_offsets, gen_formats, gen_stride,
1742 idx, should_alloc);
1743 }
1744 }
1745
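/* Emit the attribute buffer descriptor for a hardware-generated special input
 * (point coordinate, front-facing, fragment coordinate), but only if that
 * special varying is actually present in the linked varying set */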
1746 static void
1747 pan_emit_special_input(struct mali_attribute_buffer_packed *out,
1748 unsigned present,
1749 enum pan_special_varying v,
1750 unsigned special)
1751 {
1752 if (present & (1 << v)) {
1753 unsigned idx = pan_varying_index(present, v);
1754
1755 pan_pack(out + idx, ATTRIBUTE_BUFFER, cfg) {
1756 cfg.special = special;
1757 cfg.type = 0;
1758 }
1759 }
1760 }
1761
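/* Emit the complete varying state for a draw: link varyings between the vertex
 * and fragment shaders, upload their attribute descriptors, allocate the
 * backing varying buffers (general, position, point size, streamout) and the
 * special inputs, and point the vertex/tiler postfixes at the results */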
1762 void
1763 panfrost_emit_varying_descriptor(struct panfrost_batch *batch,
1764 unsigned vertex_count,
1765 struct mali_vertex_tiler_postfix *vertex_postfix,
1766 struct mali_vertex_tiler_postfix *tiler_postfix,
1767 union midgard_primitive_size *primitive_size)
1768 {
1769 /* Load the shaders */
1770 struct panfrost_context *ctx = batch->ctx;
1771 struct panfrost_device *dev = pan_device(ctx->base.screen);
1772 struct panfrost_shader_state *vs, *fs;
1773 size_t vs_size, fs_size;
1774
1775 /* Allocate the varying descriptor */
1776
1777 vs = panfrost_get_shader_state(ctx, PIPE_SHADER_VERTEX);
1778 fs = panfrost_get_shader_state(ctx, PIPE_SHADER_FRAGMENT);
1779 vs_size = MALI_ATTRIBUTE_LENGTH * vs->varying_count;
1780 fs_size = MALI_ATTRIBUTE_LENGTH * fs->varying_count;
1781
1782 struct panfrost_transfer trans = panfrost_pool_alloc_aligned(
1783 &batch->pool, vs_size + fs_size, MALI_ATTRIBUTE_LENGTH);
1784
1785 struct pipe_stream_output_info *so = &vs->stream_output;
1786 unsigned present = pan_varying_present(vs, fs, dev->quirks);
1787
1788 /* Check if this varying is linked by us. This is the case for
1789 * general-purpose, non-captured varyings. If it is, link it. If it's
1790 * not, use the provided stream out information to determine the
1791 * offset, since it was already linked for us. */
1792
1793 unsigned gen_offsets[32];
1794 enum mali_format gen_formats[32];
1795 memset(gen_offsets, 0, sizeof(gen_offsets));
1796 memset(gen_formats, 0, sizeof(gen_formats));
1797
1798 unsigned gen_stride = 0;
1799 assert(vs->varying_count < ARRAY_SIZE(gen_offsets));
1800 assert(fs->varying_count < ARRAY_SIZE(gen_offsets));
1801
1802 unsigned streamout_offsets[32];
1803
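        /* Compute the current write offset into each bound streamout buffer
         * up-front, so captured varyings can be pointed at the right place */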
1804 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1805 streamout_offsets[i] = panfrost_streamout_offset(
1806 so->stride[i],
1807 ctx->streamout.offsets[i],
1808 ctx->streamout.targets[i]);
1809 }
1810
1811 struct mali_attribute_packed *ovs = (struct mali_attribute_packed *)trans.cpu;
1812 struct mali_attribute_packed *ofs = ovs + vs->varying_count;
1813
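        /* First pass (vertex shader): emit descriptors, allocating slots in the
         * general varying buffer as we go. Second pass (fragment shader): emit
         * descriptors reusing the offsets and formats linked in the first pass */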
1814 for (unsigned i = 0; i < vs->varying_count; i++) {
1815 panfrost_emit_varying(ovs + i, vs, fs, vs, present,
1816 ctx->streamout.num_targets, streamout_offsets,
1817 dev->quirks,
1818 gen_offsets, gen_formats, &gen_stride, i, true, false);
1819 }
1820
1821 for (unsigned i = 0; i < fs->varying_count; i++) {
1822 panfrost_emit_varying(ofs + i, fs, vs, vs, present,
1823 ctx->streamout.num_targets, streamout_offsets,
1824 dev->quirks,
1825 gen_offsets, gen_formats, &gen_stride, i, false, true);
1826 }
1827
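        /* Allocate the attribute buffer descriptors: one per varying buffer
         * named in `present`, followed by one per streamout target */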
1828 unsigned xfb_base = pan_xfb_base(present);
1829 struct panfrost_transfer T = panfrost_pool_alloc_aligned(&batch->pool,
1830 MALI_ATTRIBUTE_BUFFER_LENGTH * (xfb_base + ctx->streamout.num_targets),
1831 MALI_ATTRIBUTE_BUFFER_LENGTH * 2);
1832 struct mali_attribute_buffer_packed *varyings =
1833 (struct mali_attribute_buffer_packed *) T.cpu;
1834
1835 /* Emit the stream out buffers */
1836
1837 unsigned out_count = u_stream_outputs_for_vertices(ctx->active_prim,
1838 ctx->vertex_count);
1839
1840 for (unsigned i = 0; i < ctx->streamout.num_targets; ++i) {
1841 panfrost_emit_streamout(batch, &varyings[xfb_base + i],
1842 so->stride[i],
1843 ctx->streamout.offsets[i],
1844 out_count,
1845 ctx->streamout.targets[i]);
1846 }
1847
1848 panfrost_emit_varyings(batch,
1849 &varyings[pan_varying_index(present, PAN_VARY_GENERAL)],
1850 gen_stride, vertex_count);
1851
1852 /* fp32 vec4 gl_Position */
1853 tiler_postfix->position_varying = panfrost_emit_varyings(batch,
1854 &varyings[pan_varying_index(present, PAN_VARY_POSITION)],
1855 sizeof(float) * 4, vertex_count);
1856
1857 if (present & (1 << PAN_VARY_PSIZ)) {
1858 primitive_size->pointer = panfrost_emit_varyings(batch,
1859 &varyings[pan_varying_index(present, PAN_VARY_PSIZ)],
1860 2, vertex_count);
1861 }
1862
1863 pan_emit_special_input(varyings, present, PAN_VARY_PNTCOORD, MALI_ATTRIBUTE_SPECIAL_POINT_COORD);
1864 pan_emit_special_input(varyings, present, PAN_VARY_FACE, MALI_ATTRIBUTE_SPECIAL_FRONT_FACING);
1865 pan_emit_special_input(varyings, present, PAN_VARY_FRAGCOORD, MALI_ATTRIBUTE_SPECIAL_FRAG_COORD);
1866
1867 vertex_postfix->varyings = T.gpu;
1868 tiler_postfix->varyings = T.gpu;
1869
1870 vertex_postfix->varying_meta = trans.gpu;
1871 tiler_postfix->varying_meta = trans.gpu + vs_size;
1872 }
1873
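/* Assemble the vertex and tiler payloads in the layout the GPU generation
 * expects (Bifrost or Midgard) and queue them on the batch's scoreboard. The
 * tiler job depends on the vertex job; wallpaper blits are a special case
 * injected in reverse order */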
1874 void
1875 panfrost_emit_vertex_tiler_jobs(struct panfrost_batch *batch,
1876 struct mali_vertex_tiler_prefix *vertex_prefix,
1877 struct mali_vertex_tiler_postfix *vertex_postfix,
1878 struct mali_vertex_tiler_prefix *tiler_prefix,
1879 struct mali_vertex_tiler_postfix *tiler_postfix,
1880 union midgard_primitive_size *primitive_size)
1881 {
1882 struct panfrost_context *ctx = batch->ctx;
1883 struct panfrost_device *device = pan_device(ctx->base.screen);
1884 bool wallpapering = ctx->wallpaper_batch && batch->scoreboard.tiler_dep;
1885 struct bifrost_payload_vertex bifrost_vertex = {0,};
1886 struct bifrost_payload_tiler bifrost_tiler = {0,};
1887 struct midgard_payload_vertex_tiler midgard_vertex = {0,};
1888 struct midgard_payload_vertex_tiler midgard_tiler = {0,};
1889 void *vp, *tp;
1890 size_t vp_size, tp_size;
1891
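        /* Bifrost and Midgard use different payload layouts; pick the right
         * one and point vp/tp at it for submission below */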
1892 if (device->quirks & IS_BIFROST) {
1893 bifrost_vertex.prefix = *vertex_prefix;
1894 bifrost_vertex.postfix = *vertex_postfix;
1895 vp = &bifrost_vertex;
1896 vp_size = sizeof(bifrost_vertex);
1897
1898 bifrost_tiler.prefix = *tiler_prefix;
1899 bifrost_tiler.tiler.primitive_size = *primitive_size;
1900 bifrost_tiler.tiler.tiler_meta = panfrost_batch_get_tiler_meta(batch, ~0);
1901 bifrost_tiler.postfix = *tiler_postfix;
1902 tp = &bifrost_tiler;
1903 tp_size = sizeof(bifrost_tiler);
1904 } else {
1905 midgard_vertex.prefix = *vertex_prefix;
1906 midgard_vertex.postfix = *vertex_postfix;
1907 vp = &midgard_vertex;
1908 vp_size = sizeof(midgard_vertex);
1909
1910 midgard_tiler.prefix = *tiler_prefix;
1911 midgard_tiler.postfix = *tiler_postfix;
1912 midgard_tiler.primitive_size = *primitive_size;
1913 tp = &midgard_tiler;
1914 tp_size = sizeof(midgard_tiler);
1915 }
1916
1917 if (wallpapering) {
1918 /* Inject in reverse order, with "predicted" job indices.
1919 * THIS IS A HACK XXX */
1920 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false,
1921 batch->scoreboard.job_index + 2, tp, tp_size, true);
1922 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
1923 vp, vp_size, true);
1924 return;
1925 }
1926
1927 /* If rasterizer discard is enabled, only submit the vertex job */
1928
1929 unsigned vertex = panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_VERTEX, false, 0,
1930 vp, vp_size, false);
1931
1932 if (ctx->rasterizer->base.rasterizer_discard)
1933 return;
1934
1935 panfrost_new_job(&batch->pool, &batch->scoreboard, MALI_JOB_TYPE_TILER, false, vertex, tp, tp_size,
1936 false);
1937 }
1938
1939 /* TODO: stop hardcoding this */
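/* Presumably a fixed-point encoding with (128, 128) at the pixel centre; the
 * table is uploaded verbatim below */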
1940 mali_ptr
1941 panfrost_emit_sample_locations(struct panfrost_batch *batch)
1942 {
1943 uint16_t locations[] = {
1944 128, 128,
1945 0, 256,
1946 0, 256,
1947 0, 256,
1948 0, 256,
1949 0, 256,
1950 0, 256,
1951 0, 256,
1952 0, 256,
1953 0, 256,
1954 0, 256,
1955 0, 256,
1956 0, 256,
1957 0, 256,
1958 0, 256,
1959 0, 256,
1960 0, 256,
1961 0, 256,
1962 0, 256,
1963 0, 256,
1964 0, 256,
1965 0, 256,
1966 0, 256,
1967 0, 256,
1968 0, 256,
1969 0, 256,
1970 0, 256,
1971 0, 256,
1972 0, 256,
1973 0, 256,
1974 0, 256,
1975 0, 256,
1976 128, 128,
1977 0, 0,
1978 0, 0,
1979 0, 0,
1980 0, 0,
1981 0, 0,
1982 0, 0,
1983 0, 0,
1984 0, 0,
1985 0, 0,
1986 0, 0,
1987 0, 0,
1988 0, 0,
1989 0, 0,
1990 0, 0,
1991 0, 0,
1992 };
1993
1994 return panfrost_pool_upload_aligned(&batch->pool, locations, 96 * sizeof(uint16_t), 64);
1995 }