broadcom/vc5: Find the actual first TF output for our TF spec.
[mesa.git] / src / gallium / drivers / vc5 / vc5_program.c
1 /*
2 * Copyright © 2014-2017 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include <inttypes.h>
25 #include "util/u_format.h"
26 #include "util/u_math.h"
27 #include "util/u_memory.h"
28 #include "util/ralloc.h"
29 #include "util/hash_table.h"
30 #include "tgsi/tgsi_dump.h"
31 #include "tgsi/tgsi_parse.h"
32 #include "compiler/nir/nir.h"
33 #include "compiler/nir/nir_builder.h"
34 #include "nir/tgsi_to_nir.h"
35 #include "compiler/v3d_compiler.h"
36 #include "vc5_context.h"
37 #include "broadcom/cle/v3d_packet_v33_pack.h"
38
39 static gl_varying_slot
40 vc5_get_slot_for_driver_location(nir_shader *s, uint32_t driver_location)
41 {
42 nir_foreach_variable(var, &s->outputs) {
43 if (var->data.driver_location == driver_location) {
44 return var->data.location;
45 }
46 }
47
48 return -1;
49 }
50
51 static void
52 vc5_set_transform_feedback_outputs(struct vc5_uncompiled_shader *so,
53 const struct pipe_stream_output_info *stream_output)
54 {
55 if (!stream_output->num_outputs)
56 return;
57
58 struct v3d_varying_slot slots[PIPE_MAX_SO_OUTPUTS * 4];
59 int slot_count = 0;
60
61 for (int buffer = 0; buffer < PIPE_MAX_SO_BUFFERS; buffer++) {
62 uint32_t buffer_offset = 0;
63 uint32_t vpm_start = slot_count;
64
65 for (int i = 0; i < stream_output->num_outputs; i++) {
66 const struct pipe_stream_output *output =
67 &stream_output->output[i];
68
69 if (output->output_buffer != buffer)
70 continue;
71
72 /* We assume that the SO outputs appear in increasing
73 * order in the buffer.
74 */
75 assert(output->dst_offset >= buffer_offset);
76
77 /* Pad any undefined slots in the output */
78 for (int j = buffer_offset; j < output->dst_offset; j++) {
79 slots[slot_count] =
80 v3d_slot_from_slot_and_component(VARYING_SLOT_POS, 0);
81 slot_count++;
82 }
83
84 /* Set the coordinate shader up to output the
85 * components of this varying.
86 */
87 for (int j = 0; j < output->num_components; j++) {
88 gl_varying_slot slot =
89 vc5_get_slot_for_driver_location(so->base.ir.nir, output->register_index);
90
91 slots[slot_count] =
92 v3d_slot_from_slot_and_component(slot,
93 output->start_component + j);
94 slot_count++;
95 }
96 }
97
98 uint32_t vpm_size = slot_count - vpm_start;
99 if (!vpm_size)
100 continue;
101
102 struct V3D33_TRANSFORM_FEEDBACK_OUTPUT_DATA_SPEC unpacked = {
103 /* We need the offset from the coordinate shader's VPM
104 * output block, which has the [X, Y, Z, W, Xs, Ys]
105 * values at the start. Note that this will need some
106 * shifting when PSIZ is also present.
107 */
108 .first_shaded_vertex_value_to_output = vpm_start + 6,
109 .number_of_consecutive_vertex_values_to_output_as_32_bit_values_minus_1 = vpm_size - 1,
110 .output_buffer_to_write_to = buffer,
111 };
112 V3D33_TRANSFORM_FEEDBACK_OUTPUT_DATA_SPEC_pack(NULL,
113 (void *)&so->tf_specs[so->num_tf_specs++],
114 &unpacked);
115 }
116
117 so->num_tf_outputs = slot_count;
118 so->tf_outputs = ralloc_array(so->base.ir.nir, struct v3d_varying_slot,
119 slot_count);
120 memcpy(so->tf_outputs, slots, sizeof(*slots) * slot_count);
121 }
122
/**
 * Type-size callback passed to nir_lower_io: returns the number of attribute
 * slots that \p type occupies, as reported by glsl_count_attribute_slots().
 */
static int
type_size(const struct glsl_type *type)
{
        int slots = glsl_count_attribute_slots(type, false);

        return slots;
}
128
129 static void *
130 vc5_shader_state_create(struct pipe_context *pctx,
131 const struct pipe_shader_state *cso)
132 {
133 struct vc5_context *vc5 = vc5_context(pctx);
134 struct vc5_uncompiled_shader *so = CALLOC_STRUCT(vc5_uncompiled_shader);
135 if (!so)
136 return NULL;
137
138 so->program_id = vc5->next_uncompiled_program_id++;
139
140 nir_shader *s;
141
142 if (cso->type == PIPE_SHADER_IR_NIR) {
143 /* The backend takes ownership of the NIR shader on state
144 * creation.
145 */
146 s = cso->ir.nir;
147
148 NIR_PASS_V(s, nir_lower_io, nir_var_all, type_size,
149 (nir_lower_io_options)0);
150 } else {
151 assert(cso->type == PIPE_SHADER_IR_TGSI);
152
153 if (V3D_DEBUG & V3D_DEBUG_TGSI) {
154 fprintf(stderr, "prog %d TGSI:\n",
155 so->program_id);
156 tgsi_dump(cso->tokens, 0);
157 fprintf(stderr, "\n");
158 }
159 s = tgsi_to_nir(cso->tokens, &v3d_nir_options);
160 }
161
162 NIR_PASS_V(s, nir_opt_global_to_local);
163 NIR_PASS_V(s, nir_lower_regs_to_ssa);
164 NIR_PASS_V(s, nir_normalize_cubemap_coords);
165
166 NIR_PASS_V(s, nir_lower_load_const_to_scalar);
167
168 v3d_optimize_nir(s);
169
170 NIR_PASS_V(s, nir_remove_dead_variables, nir_var_local);
171
172 /* Garbage collect dead instructions */
173 nir_sweep(s);
174
175 so->base.type = PIPE_SHADER_IR_NIR;
176 so->base.ir.nir = s;
177
178 vc5_set_transform_feedback_outputs(so, &cso->stream_output);
179
180 if (V3D_DEBUG & (V3D_DEBUG_NIR |
181 v3d_debug_flag_for_shader_stage(s->stage))) {
182 fprintf(stderr, "%s prog %d NIR:\n",
183 gl_shader_stage_name(s->stage),
184 so->program_id);
185 nir_print_shader(s, stderr);
186 fprintf(stderr, "\n");
187 }
188
189 return so;
190 }
191
192 static struct vc5_compiled_shader *
193 vc5_get_compiled_shader(struct vc5_context *vc5, struct v3d_key *key)
194 {
195 struct vc5_uncompiled_shader *shader_state = key->shader_state;
196 nir_shader *s = shader_state->base.ir.nir;
197
198 struct hash_table *ht;
199 uint32_t key_size;
200 if (s->stage == MESA_SHADER_FRAGMENT) {
201 ht = vc5->fs_cache;
202 key_size = sizeof(struct v3d_fs_key);
203 } else {
204 ht = vc5->vs_cache;
205 key_size = sizeof(struct v3d_vs_key);
206 }
207
208 struct hash_entry *entry = _mesa_hash_table_search(ht, key);
209 if (entry)
210 return entry->data;
211
212 struct vc5_compiled_shader *shader =
213 rzalloc(NULL, struct vc5_compiled_shader);
214
215 int program_id = shader_state->program_id;
216 int variant_id =
217 p_atomic_inc_return(&shader_state->compiled_variant_count);
218 uint64_t *qpu_insts;
219 uint32_t shader_size;
220
221 switch (s->stage) {
222 case MESA_SHADER_VERTEX:
223 shader->prog_data.vs = rzalloc(shader, struct v3d_vs_prog_data);
224
225 qpu_insts = v3d_compile_vs(vc5->screen->compiler,
226 (struct v3d_vs_key *)key,
227 shader->prog_data.vs, s,
228 program_id, variant_id,
229 &shader_size);
230 break;
231 case MESA_SHADER_FRAGMENT:
232 shader->prog_data.fs = rzalloc(shader, struct v3d_fs_prog_data);
233
234 qpu_insts = v3d_compile_fs(vc5->screen->compiler,
235 (struct v3d_fs_key *)key,
236 shader->prog_data.fs, s,
237 program_id, variant_id,
238 &shader_size);
239 break;
240 default:
241 unreachable("bad stage");
242 }
243
244 vc5_set_shader_uniform_dirty_flags(shader);
245
246 shader->bo = vc5_bo_alloc(vc5->screen, shader_size, "shader");
247 vc5_bo_map(shader->bo);
248 memcpy(shader->bo->map, qpu_insts, shader_size);
249
250 free(qpu_insts);
251
252 struct vc5_key *dup_key;
253 dup_key = ralloc_size(shader, key_size);
254 memcpy(dup_key, key, key_size);
255 _mesa_hash_table_insert(ht, dup_key, shader);
256
257 return shader;
258 }
259
260 static void
261 vc5_setup_shared_key(struct vc5_context *vc5, struct v3d_key *key,
262 struct vc5_texture_stateobj *texstate)
263 {
264 for (int i = 0; i < texstate->num_textures; i++) {
265 struct pipe_sampler_view *sampler = texstate->textures[i];
266 struct vc5_sampler_view *vc5_sampler = vc5_sampler_view(sampler);
267 struct pipe_sampler_state *sampler_state =
268 texstate->samplers[i];
269
270 if (!sampler)
271 continue;
272
273 key->tex[i].return_size =
274 vc5_get_tex_return_size(sampler->format);
275
276 /* For 16-bit, we set up the sampler to always return 2
277 * channels (meaning no recompiles for most statechanges),
278 * while for 32 we actually scale the returns with channels.
279 */
280 if (key->tex[i].return_size == 16) {
281 key->tex[i].return_channels = 2;
282 } else {
283 key->tex[i].return_channels =
284 vc5_get_tex_return_channels(sampler->format);
285 }
286
287 if (vc5_get_tex_return_size(sampler->format) == 32) {
288 memcpy(key->tex[i].swizzle,
289 vc5_sampler->swizzle,
290 sizeof(vc5_sampler->swizzle));
291 } else {
292 /* For 16-bit returns, we let the sampler state handle
293 * the swizzle.
294 */
295 key->tex[i].swizzle[0] = PIPE_SWIZZLE_X;
296 key->tex[i].swizzle[1] = PIPE_SWIZZLE_Y;
297 key->tex[i].swizzle[2] = PIPE_SWIZZLE_Z;
298 key->tex[i].swizzle[3] = PIPE_SWIZZLE_W;
299 }
300
301 if (sampler->texture->nr_samples > 1) {
302 key->tex[i].msaa_width = sampler->texture->width0;
303 key->tex[i].msaa_height = sampler->texture->height0;
304 } else if (sampler){
305 key->tex[i].compare_mode = sampler_state->compare_mode;
306 key->tex[i].compare_func = sampler_state->compare_func;
307 key->tex[i].wrap_s = sampler_state->wrap_s;
308 key->tex[i].wrap_t = sampler_state->wrap_t;
309 }
310 }
311
312 key->ucp_enables = vc5->rasterizer->base.clip_plane_enable;
313 }
314
315 static void
316 vc5_update_compiled_fs(struct vc5_context *vc5, uint8_t prim_mode)
317 {
318 struct vc5_job *job = vc5->job;
319 struct v3d_fs_key local_key;
320 struct v3d_fs_key *key = &local_key;
321
322 if (!(vc5->dirty & (VC5_DIRTY_PRIM_MODE |
323 VC5_DIRTY_BLEND |
324 VC5_DIRTY_FRAMEBUFFER |
325 VC5_DIRTY_ZSA |
326 VC5_DIRTY_RASTERIZER |
327 VC5_DIRTY_SAMPLE_MASK |
328 VC5_DIRTY_FRAGTEX |
329 VC5_DIRTY_UNCOMPILED_FS))) {
330 return;
331 }
332
333 memset(key, 0, sizeof(*key));
334 vc5_setup_shared_key(vc5, &key->base, &vc5->fragtex);
335 key->base.shader_state = vc5->prog.bind_fs;
336 key->is_points = (prim_mode == PIPE_PRIM_POINTS);
337 key->is_lines = (prim_mode >= PIPE_PRIM_LINES &&
338 prim_mode <= PIPE_PRIM_LINE_STRIP);
339 key->clamp_color = vc5->rasterizer->base.clamp_fragment_color;
340 if (vc5->blend->logicop_enable) {
341 key->logicop_func = vc5->blend->logicop_func;
342 } else {
343 key->logicop_func = PIPE_LOGICOP_COPY;
344 }
345 if (job->msaa) {
346 key->msaa = vc5->rasterizer->base.multisample;
347 key->sample_coverage = (vc5->rasterizer->base.multisample &&
348 vc5->sample_mask != (1 << VC5_MAX_SAMPLES) - 1);
349 key->sample_alpha_to_coverage = vc5->blend->alpha_to_coverage;
350 key->sample_alpha_to_one = vc5->blend->alpha_to_one;
351 }
352
353 key->depth_enabled = (vc5->zsa->base.depth.enabled ||
354 vc5->zsa->base.stencil[0].enabled);
355 if (vc5->zsa->base.alpha.enabled) {
356 key->alpha_test = true;
357 key->alpha_test_func = vc5->zsa->base.alpha.func;
358 }
359
360 if (vc5->framebuffer.cbufs[0]) {
361 struct pipe_surface *cbuf = vc5->framebuffer.cbufs[0];
362 const struct util_format_description *desc =
363 util_format_description(cbuf->format);
364
365 key->swap_color_rb = desc->swizzle[0] == PIPE_SWIZZLE_Z;
366 }
367
368 if (key->is_points) {
369 key->point_sprite_mask =
370 vc5->rasterizer->base.sprite_coord_enable;
371 key->point_coord_upper_left =
372 (vc5->rasterizer->base.sprite_coord_mode ==
373 PIPE_SPRITE_COORD_UPPER_LEFT);
374 }
375
376 key->light_twoside = vc5->rasterizer->base.light_twoside;
377
378 struct vc5_compiled_shader *old_fs = vc5->prog.fs;
379 vc5->prog.fs = vc5_get_compiled_shader(vc5, &key->base);
380 if (vc5->prog.fs == old_fs)
381 return;
382
383 vc5->dirty |= VC5_DIRTY_COMPILED_FS;
384
385 if (old_fs &&
386 (vc5->prog.fs->prog_data.fs->flat_shade_flags !=
387 old_fs->prog_data.fs->flat_shade_flags ||
388 (vc5->rasterizer->base.flatshade &&
389 vc5->prog.fs->prog_data.fs->shade_model_flags !=
390 old_fs->prog_data.fs->shade_model_flags))) {
391 vc5->dirty |= VC5_DIRTY_FLAT_SHADE_FLAGS;
392 }
393
394 if (old_fs && memcmp(vc5->prog.fs->prog_data.fs->input_slots,
395 old_fs->prog_data.fs->input_slots,
396 sizeof(vc5->prog.fs->prog_data.fs->input_slots))) {
397 vc5->dirty |= VC5_DIRTY_FS_INPUTS;
398 }
399 }
400
401 static void
402 vc5_update_compiled_vs(struct vc5_context *vc5, uint8_t prim_mode)
403 {
404 struct v3d_vs_key local_key;
405 struct v3d_vs_key *key = &local_key;
406
407 if (!(vc5->dirty & (VC5_DIRTY_PRIM_MODE |
408 VC5_DIRTY_RASTERIZER |
409 VC5_DIRTY_VERTTEX |
410 VC5_DIRTY_VTXSTATE |
411 VC5_DIRTY_UNCOMPILED_VS |
412 VC5_DIRTY_FS_INPUTS))) {
413 return;
414 }
415
416 memset(key, 0, sizeof(*key));
417 vc5_setup_shared_key(vc5, &key->base, &vc5->verttex);
418 key->base.shader_state = vc5->prog.bind_vs;
419 key->num_fs_inputs = vc5->prog.fs->prog_data.fs->base.num_inputs;
420 STATIC_ASSERT(sizeof(key->fs_inputs) ==
421 sizeof(vc5->prog.fs->prog_data.fs->input_slots));
422 memcpy(key->fs_inputs, vc5->prog.fs->prog_data.fs->input_slots,
423 sizeof(key->fs_inputs));
424 key->clamp_color = vc5->rasterizer->base.clamp_vertex_color;
425
426 key->per_vertex_point_size =
427 (prim_mode == PIPE_PRIM_POINTS &&
428 vc5->rasterizer->base.point_size_per_vertex);
429
430 struct vc5_compiled_shader *vs =
431 vc5_get_compiled_shader(vc5, &key->base);
432 if (vs != vc5->prog.vs) {
433 vc5->prog.vs = vs;
434 vc5->dirty |= VC5_DIRTY_COMPILED_VS;
435 }
436
437 key->is_coord = true;
438 /* Coord shaders only output varyings used by transform feedback. */
439 struct vc5_uncompiled_shader *shader_state = key->base.shader_state;
440 memcpy(key->fs_inputs, shader_state->tf_outputs,
441 sizeof(*key->fs_inputs) * shader_state->num_tf_outputs);
442 if (shader_state->num_tf_outputs < key->num_fs_inputs) {
443 memset(&key->fs_inputs[shader_state->num_tf_outputs],
444 0,
445 sizeof(*key->fs_inputs) * (key->num_fs_inputs -
446 shader_state->num_tf_outputs));
447 }
448 key->num_fs_inputs = shader_state->num_tf_outputs;
449
450 struct vc5_compiled_shader *cs =
451 vc5_get_compiled_shader(vc5, &key->base);
452 if (cs != vc5->prog.cs) {
453 vc5->prog.cs = cs;
454 vc5->dirty |= VC5_DIRTY_COMPILED_CS;
455 }
456 }
457
/**
 * Brings the compiled shader variants in the context up to date for a draw
 * using \p prim_mode.
 */
void
vc5_update_compiled_shaders(struct vc5_context *vc5, uint8_t prim_mode)
{
        /* The fragment shader goes first: the vertex shader key is built
         * from the selected fragment shader's inputs.
         */
        vc5_update_compiled_fs(vc5, prim_mode);

        vc5_update_compiled_vs(vc5, prim_mode);
}
464
465 static uint32_t
466 fs_cache_hash(const void *key)
467 {
468 return _mesa_hash_data(key, sizeof(struct v3d_fs_key));
469 }
470
471 static uint32_t
472 vs_cache_hash(const void *key)
473 {
474 return _mesa_hash_data(key, sizeof(struct v3d_vs_key));
475 }
476
477 static bool
478 fs_cache_compare(const void *key1, const void *key2)
479 {
480 return memcmp(key1, key2, sizeof(struct v3d_fs_key)) == 0;
481 }
482
483 static bool
484 vs_cache_compare(const void *key1, const void *key2)
485 {
486 return memcmp(key1, key2, sizeof(struct v3d_vs_key)) == 0;
487 }
488
489 static void
490 delete_from_cache_if_matches(struct hash_table *ht,
491 struct vc5_compiled_shader **last_compile,
492 struct hash_entry *entry,
493 struct vc5_uncompiled_shader *so)
494 {
495 const struct v3d_key *key = entry->key;
496
497 if (key->shader_state == so) {
498 struct vc5_compiled_shader *shader = entry->data;
499 _mesa_hash_table_remove(ht, entry);
500 vc5_bo_unreference(&shader->bo);
501
502 if (shader == *last_compile)
503 *last_compile = NULL;
504
505 ralloc_free(shader);
506 }
507 }
508
509 static void
510 vc5_shader_state_delete(struct pipe_context *pctx, void *hwcso)
511 {
512 struct vc5_context *vc5 = vc5_context(pctx);
513 struct vc5_uncompiled_shader *so = hwcso;
514
515 struct hash_entry *entry;
516 hash_table_foreach(vc5->fs_cache, entry) {
517 delete_from_cache_if_matches(vc5->fs_cache, &vc5->prog.fs,
518 entry, so);
519 }
520 hash_table_foreach(vc5->vs_cache, entry) {
521 delete_from_cache_if_matches(vc5->vs_cache, &vc5->prog.vs,
522 entry, so);
523 }
524
525 ralloc_free(so->base.ir.nir);
526 free(so);
527 }
528
529 static void
530 vc5_fp_state_bind(struct pipe_context *pctx, void *hwcso)
531 {
532 struct vc5_context *vc5 = vc5_context(pctx);
533 vc5->prog.bind_fs = hwcso;
534 vc5->dirty |= VC5_DIRTY_UNCOMPILED_FS;
535 }
536
537 static void
538 vc5_vp_state_bind(struct pipe_context *pctx, void *hwcso)
539 {
540 struct vc5_context *vc5 = vc5_context(pctx);
541 vc5->prog.bind_vs = hwcso;
542 vc5->dirty |= VC5_DIRTY_UNCOMPILED_VS;
543 }
544
545 void
546 vc5_program_init(struct pipe_context *pctx)
547 {
548 struct vc5_context *vc5 = vc5_context(pctx);
549
550 pctx->create_vs_state = vc5_shader_state_create;
551 pctx->delete_vs_state = vc5_shader_state_delete;
552
553 pctx->create_fs_state = vc5_shader_state_create;
554 pctx->delete_fs_state = vc5_shader_state_delete;
555
556 pctx->bind_fs_state = vc5_fp_state_bind;
557 pctx->bind_vs_state = vc5_vp_state_bind;
558
559 vc5->fs_cache = _mesa_hash_table_create(pctx, fs_cache_hash,
560 fs_cache_compare);
561 vc5->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash,
562 vs_cache_compare);
563 }
564
565 void
566 vc5_program_fini(struct pipe_context *pctx)
567 {
568 struct vc5_context *vc5 = vc5_context(pctx);
569
570 struct hash_entry *entry;
571 hash_table_foreach(vc5->fs_cache, entry) {
572 struct vc5_compiled_shader *shader = entry->data;
573 vc5_bo_unreference(&shader->bo);
574 ralloc_free(shader);
575 _mesa_hash_table_remove(vc5->fs_cache, entry);
576 }
577
578 hash_table_foreach(vc5->vs_cache, entry) {
579 struct vc5_compiled_shader *shader = entry->data;
580 vc5_bo_unreference(&shader->bo);
581 ralloc_free(shader);
582 _mesa_hash_table_remove(vc5->vs_cache, entry);
583 }
584 }